Example #1
def make_score_hold(factory,seasons,score_name,compare=(lambda x,y:x!=y),modelFactory=linear_model.LogisticRegression):
	train_vector,train_ids,targets,test_vector,test_ids = factory() 
	projects 	= utils.read_csv('projects.csv')[['projectid','date_posted']]	
	train_idx 	= { pid:i for i,pid in enumerate(train_ids) }
	projects['idx'] = projects.projectid.map(train_idx)
	projects.date_posted = [ '%s-%02d-01'%(d[:4],(int(d[5:7])-1)/seasons+1) for d in projects.date_posted ]
	projects.loc[projects.date_posted <  '2008-01-01', 'date_posted'] = '2007-12-01'
	projects.loc[projects.date_posted >= '2014-01-01', 'date_posted'] = '2014-01-01'
	training = projects[ projects.idx.notnull() ]
	outcomes = utils.read_csv('outcomes.csv')[['projectid','is_exciting']]
	training = pd.merge(training,outcomes)
	def handle(x):
		day = x.date_posted.iloc[0]
		logging.info('handling %s of shape %s'%(day,x.shape))
		idx = compare(training.date_posted,day) 
		if idx.sum()==0 :
			x[score_name] = -1
			return x
		train_x = train_vector[training.idx[idx]]
		train_y = training.is_exciting[idx]
		if train_y.sum()==0 :
			x[score_name] = -1
			return x
		model = modelFactory()
		model.fit(train_x,train_y)
		pred_x = test_vector if day == '2014-01-01' else train_vector[list(training.idx[training.date_posted==day])]
		x[score_name] = model.predict_proba(pred_x)[:,1]
		return x
	scores = projects.groupby('date_posted').apply(handle).reset_index()[['projectid',score_name]]
	return scores
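A hedged usage sketch (not from the original project): the factory must return the (train_vector, train_ids, targets, test_vector, test_ids) tuple, which matches to_tfidf_vector in Example #9 below; the compare override here restricts training to strictly earlier periods.

# Sketch only: assumes to_tfidf_vector (Example #9) is importable alongside make_score_hold.
from functools import partial

scores = make_score_hold(
    factory=partial(to_tfidf_vector, 'essay'),    # tf-idf vectors over the essay field
    seasons=3,                                    # bucket date_posted into 3-month periods
    score_name='essay_tfidf_hold',
    compare=lambda posted, day: posted < day,     # train only on periods before the one being scored
)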
Example #2
def load_data(lines_file,conversations_file) :
    # build a dict lineID -> {characterid,charactername,movieid,text}
    lines_dico={}
    for line in utils.read_csv(lines_file,' +++$+++ ') :
        lines_dico[line[0]]={'characterid':line[1],'charactername':line[3],'movieid':line[2],'text':line[4]}
    # build the film dict: id -> [dialogues = [text1,text2...]]
    films={}
    for dialogue in utils.read_csv(conversations_file,' +++$+++ ') :
        filmid=dialogue[2]
        if filmid not in films : films[filmid]=[]
        #print dialogue[3]
        #lineids = json.loads(dialogue[3])# issue with json string ?
        lineids = utils.parse_json_array(dialogue[3])
        films[filmid].append([lines_dico[lineid]['text'] for lineid in lineids])
    return films
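For reference, a minimal sketch of the layout load_data expects, with field order inferred from the indices used above (file names and the movie id are illustrative):

# lines file row:          lineID +++$+++ characterID +++$+++ movieID +++$+++ characterName +++$+++ text
# conversations file row:  characterID1 +++$+++ characterID2 +++$+++ movieID +++$+++ ['L194', 'L195', ...]
films = load_data('movie_lines.txt', 'movie_conversations.txt')   # hypothetical file names
for dialogue in films.get('m0', []):                              # 'm0' is an assumed movie id
    print(dialogue)                                               # each dialogue is a list of utterance texts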
Example #3
def convert_metadata(f,t):
    res=[]
    for line in utils.read_csv(f,' +++$+++ ') :
        filmid, filmtitle, filmyear = line[0], line[1], line[2][:4]
        filmrating, filmratingnum = line[3], line[4]
        for filmtype in utils.parse_json_array(line[5]):
            res.append([filmid,filmtitle,filmyear,filmrating,filmratingnum,filmtype])
    utils.export_csv(res,t,';')
Example #4
def _feature_020():
	'''
		basic info from resources.csv
		@return id => [resouceid_cnt,vendor_cnt,project_resource_type_cnt,
				item_quantity_sum,item_quantity_max,
				item_unit_price_mean,item_unit_price_max
				]
	'''
	resources = utils.read_csv('resources.csv')
	gp = resources.groupby('projectid')
	def mapper(x):
		return pd.Series([
			x.shape[0],
			x.vendorid.unique().shape[0],
			x.project_resource_type.unique().shape[0],
			x.item_quantity.sum(),x.item_quantity.max(),
			x.item_unit_price.mean(),x.item_unit_price.max(),
		])
	data = gp.apply(mapper)
	columns = [	"resouceid_cnt","vendor_cnt","project_resource_type_cnt",
				"item_quantity_sum","item_quantity_max",
			    "item_unit_price_mean","item_unit_price_max"]
	data.columns = columns
	data = data.reset_index()
	return data,columns
Example #5
def dedupe_licenses(licenses_by_country):
    all_licenses = reduce(set.union,
                          ((n for n, c in v if c > 0)
                           for *_, v in licenses_by_country),
                          set())
    all_licenses = ((i, i.lower()) for i in sorted(all_licenses, key=str.lower))
    csv_path = str(Path(__file__).parent/'data'/'license_mappings.csv')
    existing_keys = {k for k, _ in read_csv(csv_path)}
    with open(csv_path, 'a') as file:
        csv_writer(file).writerows((a, b) for a, b in all_licenses
                                   if a not in existing_keys)
    with open(str(Path(__file__).parent/'data'/'license_details.yaml'), 'w') \
            as file:
        yaml.safe_dump({c: l for c, l, _ in licenses_by_country}, file,
                       allow_unicode=True, default_flow_style=False)
    input('Press any key to continue')  # Pause before reloading the CSV
    return dict(read_csv(csv_path))
Example #6
def feature_008d(feature):
	''' @return id => sparse encoded text lengths in essays.csv '''
	essays = utils.read_csv('essays.csv')   #pd.read_csv(os.path.join(utils.DATA_DIR,'essays.csv'))
	data = pd.DataFrame(feature.projectid)
	columns = ['title','short_description','need_statement','essay']
	for c in columns :
		data['length_%s'%(c)] = essays[c].fillna('').map(len)
	dimensions = { 'length_%s'%(c):1 for c in columns }
	return data,dimensions
Example #7
def feature_008(feature,dim=40,step=0.2):
	''' @return id => sparse encoded text lengths in essays.csv '''
	essays = utils.read_csv('essays.csv')   #pd.read_csv(os.path.join(utils.DATA_DIR,'essays.csv'))
	data = pd.DataFrame(feature.projectid)
	columns = ['title','short_description','need_statement','essay']
	for c in columns :
		data['length_%s'%(c)] = essays[c].fillna('').map(len)
	columns = [ 'length_%s'%(c) for c in columns ]
	return sparse_encoder_002(data,columns,dim,step)
Example #8
 def diff_grade(self, diff_grade, class_name):
     """returns the difference between the current class grade
     and the last one
     """
     for name, grade, _ in reversed(utils.read_csv('data.csv')):
         if name == class_name:
             return float(grade) - float(diff_grade)
     return 0.0
Example #9
def to_tfidf_vector(field,max_df=0.5,min_df=2,max_features=5000):
	'''
	@return train_vectors,train_ids,targets,test_vectors,test_ids
	'''
	df = utils.read_csv('essays.csv')
	outcomes = utils.read_csv('outcomes.csv')[['projectid','is_exciting']]
	df = pd.merge(df,outcomes,how='left')
	texts = df[field].fillna('')
	model  = TfidfVectorizer(max_df=max_df,min_df=min_df,max_features=max_features)
	model.fit(texts)
	train_idx,test_idx = df.is_exciting.notnull(),df.is_exciting.isnull()

	train_texts = texts[train_idx]
	test_texts	= texts[test_idx]
	train_ids   = df.projectid[train_idx]
	test_ids 	= df.projectid[test_idx]
	targets		= df.is_exciting[train_idx]
	train_vector = model.transform(train_texts)
	test_vector = model.transform(test_texts)
	return train_vector,train_ids,targets,test_vector,test_ids
Example #10
def main(file_in, file_out):
    own_rows = [r for r in read_csv('portals.csv', has_header=True)[1]
                if r['url'] and r['url'] != 'N/A']

    fields, rows = read_csv(file_in, has_header=True)
    rows = sorted(chain(rows,
                        ({'name': format_name(r['url']),
                          'title': r['title'],
                          'url': r['url'],
                          'publisher': format_publisher(r['presiding_body']),
                          'publisher_classification': 'Government',
                          'tags': ' '.join(format_tags(r)),
                          'country': r['country_code'],
                          'generator': r['software_platform'],
                          'api_endpoint': r['metadata_api_endpoint']}
                         for r in own_rows)), key=lambda r: r['name'])
    with open(file_out, 'w') as file:
        writer = csv_dict_writer(file, fields)
        writer.writeheader()
        writer.writerows(rows)
Example #11
def prepare_data(filename,columns,target_columns=['is_exciting'],fillna=0,since=None):
	feature = utils.read_csv(filename)
	if filename != 'projects.csv' :
		pids = utils.read_csv('projects.csv')[['projectid','date_posted']]
		feature = pd.merge(pids,feature,how='left')
	data_target_columns = [ c for c in target_columns if c in feature.columns ]
	outcome_target_columns = [ c for c in target_columns if c not in feature.columns ]
	data = feature[['projectid','date_posted']+columns+data_target_columns]
	if since is not None:
		data = data[data.date_posted>=since]
	logging.info('prepare_data beginning, data shape=%s'%(data.shape,))
	if len(outcome_target_columns) > 0 :
		outcomes = utils.read_csv('outcomes.csv')
		outcomes = outcomes[['projectid']+outcome_target_columns] 
		for c in outcomes.columns:
			if c != 'projectid':
				outcomes[c] = map(int,outcomes[c].fillna(0))
		data = pd.merge(data,outcomes,how='left')
		if fillna is not None:
			data = data.fillna(fillna)
	return data
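A hedged call sketch for prepare_data (column names here are illustrative; is_exciting is joined from outcomes.csv because it is not a projects.csv column):

# Sketch only: pull a couple of project columns plus the is_exciting label for recent projects.
data = prepare_data('projects.csv',
                    columns=['school_state', 'poverty_level'],   # assumed projects.csv columns
                    target_columns=['is_exciting'],
                    fillna=0,
                    since='2013-01-01')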
Example #12
 def diff_grade_custom(self, comp_grade, class_name, comp_date):
     """returns the difference between the current class grade and the grade from the provided
     timedelta
     """
     diff = ''
     for entry in reversed(utils.read_csv('data.csv')):
         if entry[0] == class_name and entry[2] == str(comp_date):
             diff = float(entry[1]) - float(comp_grade)
             if diff < 0:
                 return '+' + str(diff)
             else:
                 return diff
     return 0.0
Example #13
def tfidf_encoder(filename,columns,max_df,min_df,max_features):
	df = utils.read_csv('essays.csv')   # note: always reads essays.csv; the filename argument is only used to label the output columns
	data = pd.DataFrame(df.projectid)
	dimensions = {}
	for c in columns :
		texts = df[c].fillna('')
		model  = TfidfVectorizer(max_df=max_df,min_df=min_df,max_features=max_features)
		model.fit(texts)
		vector = model.transform(texts)
		c_name = '%s:tfidf_%s'%(filename,c)
		data[c_name] = map(lambda x:{ k:x[0,k] for k in x.nonzero()[1] },vector)
		dimensions[c_name] = vector.shape[1]
	return data,dimensions
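Each encoded cell above is a {feature_index: tfidf_weight} dict whose indices stay below the declared dimension; a hedged sketch of how the output might be consumed (arguments are illustrative):

# Sketch only: encode the essay column and inspect one row's sparse representation.
data, dimensions = tfidf_encoder('essays.csv', ['essay'], max_df=0.5, min_df=2, max_features=5000)
col = 'essays.csv:tfidf_essay'
first_row = data[col].iloc[0]            # e.g. {12: 0.31, 847: 0.08, ...}
assert all(k < dimensions[col] for k in first_row)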
Example #14
def tfidf_encoder_001(filename,columns,max_df,min_df,max_features):
	'''
		make tfidf vectors and summary stats for gbdt
		return id => { num_words,sum_tfidf } for every column
	'''
	df = utils.read_csv(filename)
	data = pd.DataFrame(df.projectid)
	dimensions = {}
	for c in columns :
		texts = df[c].fillna('')
		model  = TfidfVectorizer(max_df=max_df,min_df=min_df,max_features=max_features)
		model.fit(texts)
		vector = model.transform(texts)
		stats = [ '%s@%s'%(s,c) for s in ['#words','sum_tfidf'] ]
		data[stats[0]] = [ vector[i].nonzero()[0].shape[0] for i in range(vector.shape[0]) ]
		data[stats[1]] = vector.sum(1)
		dimensions.update({ s:1 for s in stats })
	return data,dimensions
Example #15
def load_filmmetadata(f):
    res= {}
    #years=set()
    years={}
    types=set()
    for line in utils.read_csv(f,' +++$+++ ') :
        filmid=line[0]
        res[filmid]={'title':line[1],'year':line[2]}
        if line[2] not in years:
            years[line[2]]=1
        else :
            years[line[2]]=years[line[2]]+1
        # get film types
        filmtypes = utils.parse_json_array(line[5])
        for t in filmtypes : types.add(t)
    #print(years)
    print(types)
    return res
Example #16
def read_csv(filepath):
    '''
    TODO: This function needs to be completed.
    Read the events.csv, mortality_events.csv and event_feature_map.csv files into events, mortality and feature_map.

    Return events, mortality and feature_map
    '''
    # Columns in events.csv - patient_id,event_id,event_description,timestamp,value
    # Columns in mortality_event.csv - patient_id,timestamp,label
    # Columns in event_feature_map.csv - idx,event_id
    events, mortality, feature_map = utils.read_csv(filepath)

    return events, mortality, feature_map
Example #17
        "-image_field",
        type=str,
        default="Input.Image_url",
        help="(Only used if compute_distances = True) Name of CSV field containing image URL",
    )
    parser.add_argument(
        "-draw_events_field",
        type=str,
        default="Answer.WritingTexts",
        help="(Only used if compute_distances = True) Name of CSV field containing drawing task events",
    )

    args = parser.parse_args()
    input_file = args.csv
    data_with_distance = []
    file_header = []
    if args.compute_distances:
        if not args.image_dir:
            parser.print_help()
            raise ValueError("-image_dir parameter is required if -compute_distances is true.")
        generator = BitmapMaker(args.bitmap_dim, args.bitmap_dim)
        image_url_field = args.image_field
        actions_field = args.draw_events_field
        data_with_distance, file_header = evaluate.get_hamming_distances(
            args.csv, args.image_dir, args.output, generator, image_url_field, actions_field
        )
    else:
        data_with_distance, file_header = utils.read_csv(input_file)

    filter(data_with_distance, file_header, args.cutoff, args.output)
Example #18
def main():
    fields, rows = read_csv('portals.csv', has_header=True)
    country_stats = gather_country_stats(asyncio.get_event_loop(), rows)
    create_licenses_csv(tuple((c, d, l) for c, _, d, l in country_stats))
    update_portals_csv(fields, rows, {c: t for c, t, *_ in country_stats})
Example #19
def aff2city(aff):
    city_df = read_csv(DATA_PATH, 'city_name.csv')
    city_names = [city.lower() for city in city_df['City'].values]
    for i in range(len(city_names)):
        if city_names[i] in aff.lower() and city_df['level'][i] == 1:
            return city_names[i]
Example #20
# from spacySim import spacySim, spacyPhraseSim
from gloveSim import gloveSim

with open('theme_dict_set.p') as f:
    theme_dict_set = pickle.load(f)

with open('prop_dict_set.p') as f:
    prop_dict_set = pickle.load(f)

app = Flask(__name__)

# --- Init vocab (TODO: really hacky, fix this later)
# --- Assume 1 word per line csv
topics = {}
for k,v in TOPICS.iteritems():
    topics[k] = [(w[0], w[1].lower()) for w in read_csv(v)]

# --- This is also hacky
TOPIC_STAT = {
    'weddingTheme': {
        'sim_mean': 0.111250152359,
        'sim_sd': 0.123640928544
    },
    'weddingProp': {
        'sim_mean': 0.141468741736,
        'sim_sd': 0.129488421166
    }

}

Example #21
def load_atomic2020(args):
    random.seed(args.random_seed)

    atomic2020_v1_file = args.data_folder + "atomic_original_tuples.tsv"
    atomic2020_addl_file = args.data_folder + "atomic_additional_tuples.tsv"
    atomic2020_cn_file = args.data_folder + "atomic_conceptnet_tuples.tsv"

    v1_data = read_csv(atomic2020_v1_file, delimiter="\t", skip_header=True)
    addl_data = read_csv(atomic2020_addl_file, delimiter="\t", skip_header=True)
    cn_data_with_id = read_csv(atomic2020_cn_file, delimiter="\t", skip_header=True)
    cn_data = [l[1:] for l in cn_data_with_id]

    # Atomic split
    atomic_train = read_csv(args.atomic_split + "train.tsv", delimiter="\t", skip_header=False)
    atomic_dev = read_csv(args.atomic_split + "dev.tsv", delimiter="\t", skip_header=False)
    atomic_test = read_csv(args.atomic_split + "test.tsv", delimiter="\t", skip_header=False)
    atomic_train_events = get_head_set(atomic_train)
    atomic_dev_events = get_head_set(atomic_dev)
    atomic_test_events = get_head_set(atomic_test)
    v1_data_train = [l for l in v1_data if l[0] in atomic_train_events]
    v1_data_dev = [l for l in v1_data if l[0] in atomic_dev_events]
    v1_data_test = [l for l in v1_data if l[0] in atomic_test_events]
    assert len(v1_data) == len(v1_data_train) + len(v1_data_dev) + len(v1_data_test)

    # CN split
    cn_train = read_csv(args.conceptnet_split + "train.tsv", delimiter="\t", skip_header=False)
    cn_dev = read_csv(args.conceptnet_split + "dev.tsv", delimiter="\t", skip_header=False)
    cn_test = read_csv(args.conceptnet_split + "test.tsv", delimiter="\t", skip_header=False)
    cn_train_heads = get_head_set(cn_train)
    cn_dev_heads = get_head_set(cn_dev)
    cn_test_heads = get_head_set(cn_test)

    cn_data_train = [l for l in cn_data if l[0] in cn_train_heads]
    cn_data_dev = [l for l in cn_data if l[0] in cn_dev_heads]
    cn_data_test = [l for l in cn_data if l[0] in cn_test_heads]

    # Additional tuples split
    (addl_train, addl_dev, addl_test) = head_based_split(addl_data,
                                                         dev_size=args.dev_size,
                                                         test_size=args.test_size,
                                                         head_size_threshold=args.head_size_threshold,
                                                         dev_heads=atomic_dev_events,
                                                         test_heads=atomic_test_events)

    new_addl_train = []
    new_addl_dev = []
    new_addl_test = addl_test
    for l in addl_train:
        h = l[0]
        if h in cn_dev_heads:
            new_addl_dev.append(l)
        elif h in cn_test_heads:
            new_addl_test.append(l)
        else:
            new_addl_train.append(l)

    for l in addl_dev:
        h = l[0]
        if h in cn_test_heads:
            new_addl_test.append(l)
        else:
            new_addl_dev.append(l)

    train = v1_data_train + cn_data_train + new_addl_train
    dev = v1_data_dev + cn_data_dev + new_addl_dev
    test = v1_data_test + cn_data_test + new_addl_test

    return train, dev, test
Example #22
File: main.py Project: Ealdor/pypbp
	def start(self):
		self.status.set("Estado: -")
		self.leng.set("Iteración: -/- y Número: -/-")
		self.totaltime.set("Tiempo total: -")
		self.ones.set("Total de unos: -")
		self.types.set("Progreso: -")
		self.startButton.config(state = 'disabled')
		self.browseButton.config(state = 'disabled')
		self.cancelButton.config(state = 'normal')
		self.maxnumberSpinbox.config(state = 'disabled')
		self.complexSpinbox.config(state = 'disabled')
		if 1 <= int(self.complexSpinbox.get()) <= 5 and 1 <= int(self.maxnumberSpinbox.get()) <= 21 and self.completeName != "":
			start_time = time.time()
			if self.name.get().split('.')[1] == 'csv':
				self.g = generator.Generator(self.maxnumberSpinbox.get(), self.complexSpinbox.get(), utils.read_csv(self.completeName), self.cancelButton, self.types)
			else:
				self.g = generator.Generator(self.maxnumberSpinbox.get(), self.complexSpinbox.get(), utils.read_json(self.completeName), self.cancelButton, self.types)
			self.g.count_one()
			self.ones.set("Total de unos: {0}".format(len(self.g.table_uno)))
			i = 0
			while self.g.maxim > 1:
				i += 1
				self.leng.set("Iteración: {0}/{1} y Número: {2}".format(i, self.complexSpinbox.get(), self.g.maxim))
				self.status.set("Estado: Generando puzzle...")
				self.g.step_one()
				tim = utils.sec_to(int(time.time() - start_time))
				self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
				self.status.set("Estado: Aplicando condición uno...")
				self.g.cond_dos(1)
				tim = utils.sec_to(int(time.time() - start_time))
				self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
				self.status.set("Estado: Aplicando condición dos...")
				self.g.cond_dos(2)
				tim = utils.sec_to(int(time.time() - start_time))
				self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
				
				if self.g.maxim >= 4:
					self.status.set("Estado: Aplicando condición tres...")
					self.g.cond_dos(3)
					tim = utils.sec_to(int(time.time() - start_time))
					self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
				
				self.g.count_one()
				self.ones.set("Total de unos: {0}".format(len(self.g.table_uno)))
				if i == self.g.iters:
					self.g.maxim -= 1
					i = 0
			if self.name.get().split('.')[1] == 'csv':
				utils.write_csv(self.g.table_all)
			else:
				utils.write_json(self.g.table_all)

			if self.g.cancel:
				self.status.set("Estado: Cancelado")
			else:
				self.status.set("Estado: Completado")
			self.g = None
		self.startButton.config(state = 'normal')
		self.browseButton.config(state = 'normal')
		self.cancelButton.config(state = 'disabled')
		self.maxnumberSpinbox.config(state = 'normal')
		self.complexSpinbox.config(state = 'normal')
Example #23
import utils
import sys


'''
 This file accepts command line arguments, processes the contents of the input file, and generates the report in CSV format.

 Command Line Arguments
 Input Path : String : Path to the input file
 Output Path : String : Path to which the report has to be written
'''
if len(sys.argv) != 3:
    raise ValueError("Please enter input file and output file to be created as arguments")

input_path = sys.argv[1]
output_path = sys.argv[2]

data_dict = utils.read_csv(input_path)
data_dict = dict(sorted(data_dict.items(), key=lambda x: x[0]))
utils.to_csv(data_dict, output_path)
Example #24
def process(wav_file, outfile, csv_file=None, bpm=None, tol=0.35,
            ssm_read_pk=False, read_pk=False, rho=2, is_ismir=False,
            tonnetz=False, sonify=False):
    """Main process to find the patterns in a polyphonic audio file.

    Parameters
    ----------
    wav_file : str
        Path to the wav file to be analyzed.
    outfile : str
        Path to file to save the results.
    csv_file : str
        Path to the csv containing the midi_score of the input audio file
        (needed to produce a result that can be read for JKU dataset).
    bpm : int
        Beats per minute of the piece. If None, bpms are read from the JKU.
    tol : float
        Tolerance to find the segments in the SSM.
    ssm_read_pk : bool
        Whether to read the SSM from a pickle file.
    read_pk : bool
        Whether to read the segments from a pickle file.
    rho : int
        Positive integer to compute the score of the segments.
    is_ismir : bool
        Produce the plots that appear on the ISMIR paper.
    tonnetz : bool
        Whether to use Tonnetz or Chromas.
    sonify : bool
        Whether to sonify the patterns or not.
    """

    # Get the correct bpm if needed
    if bpm is None:
        bpm = get_bpm(wav_file)

    # Algorithm parameters
    min_notes = 8
    max_diff_notes = 4
    h = bpm / 60. / 8.  # Hop size /8 works better than /4, but it takes longer
                        # to process

    # Obtain the Self Similarity Matrix
    X = compute_ssm(wav_file, h, ssm_read_pk, is_ismir, tonnetz)

    # Read CSV file
    if csv_file is not None:
        logging.info("Reading the CSV file for MIDI pitches...")
        midi_score = utils.read_csv(csv_file)

    patterns = []
    csv_patterns = []
    while patterns == [] or csv_patterns == []:
        # Find the segments inside the self similarity matrix
        logging.info("Finding segments in the self-similarity matrix...")
        max_diff = int(max_diff_notes / float(h))
        min_dur = int(np.ceil(min_notes / float(h)))
        #print min_dur, min_notes, h, max_diff
        if not read_pk:
            segments = []
            while segments == []:
                logging.info("\ttrying tolerance %.2f" % tol)
                segments = utils.find_segments(X, min_dur, th=tol, rho=rho)
                tol -= 0.05
            utils.write_cPickle(wav_file + "-audio.pk", segments)
        else:
            segments = utils.read_cPickle(wav_file + "-audio.pk")

        # Obtain the patterns from the segments and split them if needed
        logging.info("Obtaining the patterns from the segments...")
        patterns = obtain_patterns(segments, max_diff)

        # Decrease tolerance in case we couldn't find any patterns
        tol -= 0.05

        # Get the csv patterns if they exist
        if csv_file is not None:
            csv_patterns = patterns_to_csv(patterns, midi_score, h)
        else:
            csv_patterns = [0]

    # Sonify patterns if needed
    if sonify:
        logging.info("Sonifying Patterns...")

        utils.sonify_patterns(wav_file, patterns, h)

    # Formatting csv patterns and save results
    if csv_file is not None:
        logging.info("Writting results into %s" % outfile)
        utils.save_results(csv_patterns, outfile=outfile)
    else:
        # If not csv, just print the results on the screen
        print_patterns(patterns, h)

    if is_ismir:
        ismir.plot_segments(X, segments)

    # Alright, we're done :D
    logging.info("Algorithm finished.")
Example #25
import os.path
import numpy as np
from classic_algorithms import FirstFit
from classic_algorithms import NextFit
from classic_algorithms import BestFit
from environment import BinEnvironment
from environment import ItemProvider
from agent import Agent
from utils import plot_learning
from utils import read_csv

max_simultaneously_bins = 5
load_checkpoint = True
filename = "flat"
data = read_csv(file="training_data/%s.csv" % filename)
print(data)

agent = Agent(gamma=0.99,
              epsilon=1.0,
              batch_size=64,
              n_actions=4,
              eps_end=0.01,
              input_dims=[2],
              lr=0.001)
agent.train()
if load_checkpoint and os.path.isfile("%s_checkpoint.pth.tar" % filename):
    agent.load_checkpoint(source_file="%s_checkpoint.pth.tar" % filename)

item_provider = ItemProvider(sample_size=100, data=data, randomize=True)
env = BinEnvironment(max_simultaneously_bins, item_provider=item_provider)
Example #26
        'Prénom du candidat', 'N° tour', 'Nom de la commune',
        'N° de dépôt de la liste', 'Prénom du candidat tête de liste',
        u"Date de l'export", 'Libellé du département',
        'Libellé de la commune', '% Abs/Ins', '% Vot/Ins', '% BlNuls/Ins',
        '% BlNuls/Vot', '% Exp/Ins', '% Exp/Vot', 'Sexe', 'Prénom',
        'Liste', 'Sieges', u'% Voix/Ins', u'% Voix/Exp', 'Abstentions',
        'Blancs et nuls']

#FIXME: if the available data were more uniform, we could automate
# much more of this.

# Export the data as several dataframes:
with pd.HDFStore(os.path.join(var.path['out'], 'elections.h5'), 'w') as store:
    # European elections, 2014:
    fname = get("http://www.regardscitoyens.org/telechargement/donnees/elections/2014_europeennes/europ%C3%A9ennes-2014-r%C3%A9sultats-bureaux_vote-tour1.csv")
    store.put('europeennes2014', fmt_elections(read_csv(fname[0], cols_mapper,
        cols_to_drop, dtype_mapper)), format='fixed')

    # Municipal elections, second turn, 2014:
    fname = get("http://www.regardscitoyens.org/telechargement/donnees/elections/2014_municipales/municipales-2014-r%C3%A9sultats-bureaux_vote-tour2.csv")
    store.put('municipales2014_tour2', fmt_elections(read_csv(fname[0],
        cols_mapper, cols_to_drop, dtype_mapper)), format='fixed')

    # Municipal elections, first turn, 2014:
    fname = get("http://www.regardscitoyens.org/telechargement/donnees/elections/2014_municipales/municipales-2014-r%C3%A9sultats-bureaux_vote-tour1.csv")
    store.put('municipales2014_tour1', fmt_elections(read_csv(fname[0],
        cols_mapper, cols_to_drop, dtype_mapper)), format='fixed')

    # Legislative elections, 2012:
    fname = get("http://www.nosdonnees.fr/storage/f/2013-03-05T184148/LG12_BV_T1T2.zip")
    store.put('legislatives2012_tour2', fmt_elections(read_csv(fname[1],
        cols_mapper, cols_to_drop, dtype_mapper, encoding='cp1252')),
Example #27
    if os.path.exists(MEDIANS_PLOT_PATH):
        shutil.rmtree(MEDIANS_PLOT_PATH)

    for index, ecg_file in enumerate(median_files):
        print(f'Generating plots: { index + 1 } / { len(median_files) }',
              end='\r')

        ecg_id = os.path.splitext(ecg_file)[0]

        file_path = os.path.join(MEDIANS_PLOT_PATH,
                                 f'{ ecg_id }.{ PLOT_FORMAT }')

        # Read ECG data from .asc file
        ecg_data = read_csv(os.path.join(MEDIANS_PATH, ecg_file),
                            delimiter=' ',
                            transpose=True,
                            skip_header=False,
                            dtype=np.int)

        # Normalize ECG data between -1 and 1
        ecg_normalized = normalize(ecg_data)

        # Scale normalized ECG data between -Y_MAX and Y_MAX
        ecg_scaled = np.array(ecg_normalized * Y_MAX, dtype=int)

        # Reduce the length of ECG data by dropping every other element
        ecg_reduced = shorten(ecg_scaled)

        # Remove the first n values to fit within the X_MAX limit
        if len(ecg_reduced[0]) > X_MAX:
            ecg_reduced = [
Example #28
 def mining(self):
     """Mining frequent sequences by prefixSpan"""
     S = utils.read_csv(self.datafile)
     self.patternSet = utils.prefixSpan(S, Sequence([], sys.maxint), self.support)
Example #29
    parser.add_argument("-output_json_file", type=str, required=True, help="Output json file")
    
    parser.add_argument("-rseed", type=int, default=0, help="Random seed")
    parser.add_argument("-train_ratio", type=float, default=0.67, help="Ratio (between 0 and 1) of examples to use for training")
    parser.add_argument("-test_ratio", type=float, default=0.17, help="Ratio (between 0 and 1) of examples to use for testing")
    parser.add_argument("-val_ratio", type=float, default=0.16, help="Ratio (between 0 and 1) of examples to use for validation")
    parser.add_argument("-commands_field", type=str, default="Input.commands", help="Name of CSV field containing descriptions to arrange blocks")
    parser.add_argument("-image_field", type=str, default="Input.Image_url", help="Name of CSV field containing image URL")
    parser.add_argument("-retrieve_text_bitmaps", type=bool, default=True, help="Retrieve and augment data with bitmaps located adjacent to image URL's")
    parser.add_argument("-draw_events_field", type=str, default="Answer.WritingTexts", help="Name of CSV field containing drawing task events")

    args = parser.parse_args()

    random.seed(args.rseed)

    header, rows = read_csv(args.csv)

    commands_idx = header.index(args.commands_field)
    actions_idx = header.index(args.draw_events_field)
    image_idx = header.index(args.image_field)

    key_attrs = split_keys(rows, image_idx, args.train_ratio, args.test_ratio, args.val_ratio)
    output_file = args.output_json_file

    d = os.path.dirname(output_file)
    if not os.path.exists(d) and len(d)>0:
        os.makedirs(d)

    print "Processing {} rows".format(len(rows))
    write_data(rows, key_attrs, image_idx, commands_idx, actions_idx, output_file, include_bitmaps=args.retrieve_text_bitmaps)
Example #30
from glob import glob
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from utils import read_file, get_name_from_filepath, read_csv
import pandas as pd
from textstat import textstat
#nltk.download('punkt')

litigious_words = read_csv(r'C:\Users\Krista\hansell\lm_litigious.csv')
files = glob(
    r'C:\Users\Krista\DocumentsRE _Call_re_potential_matter\out\\*.txt')

info_dicts = []
for file in files:
    info_dict = {}
    name = get_name_from_filepath(file)
    text = read_file(file)
    words = word_tokenize(text)
    words_in_list = [
        word for word in words
        if any(lit in word for lit in litigious_words.index)
    ]

    info_dict['Name'] = name
    info_dict['Total litigious words'] = len(words_in_list)
    info_dict['Total words'] = len(words)

    info_dicts.append(info_dict)

df = pd.DataFrame(info_dicts)
df.to_csv(