def handle_noargs(self, **options):
        global YEAR, COMMIT
        YEAR = options['year']
        COMMIT = options['commit']

        if not options['candidates'] or not os.path.exists(options['candidates']):
            print >> sys.stderr, "The candidates file doesn't exist"
            sys.exit(1)
        if not YEAR:
            print >> sys.stderr, "You must specify a year"
            sys.exit(1)

        #check all the parties exist
        with open(options['candidates'], 'rb') as csvfile:
            candidates = unicodecsv.reader(csvfile)
            missingparties = False
            lastmissingparty = ''
            for row in candidates:
                if not get_party(row[0]):
                    if row[0] != lastmissingparty:
                        print 'Missing party:', row[0]
                        lastmissingparty = row[0]
                    missingparties = True
            if missingparties:
                sys.exit(1)

        #check whether the positions exist, otherwise create them
        check_or_create_positions()

        with open(options['candidates'], 'rb') as csvfile:
            candidates = unicodecsv.reader(csvfile)
            for row in candidates:
                if not search(row[3], row[4], row[0], row[2], row[1]):
                    add_new_person(row[0], row[2], row[1], row[3], row[4])
Example #2
File: csv.py Project: kdodia/blaze
    def _get_py(self, key):
        if isinstance(key, tuple):
            assert len(key) == 2
            result = self._get_py(key[0])

            if isinstance(key[1], list):
                getter = itemgetter(*key[1])
            else:
                getter = itemgetter(key[1])

            if isinstance(key[0], (list, slice)):
                return map(getter, result)
            else:
                return getter(result)

        f = self.open(self.path)
        if self.header:
            next(f)
        if isinstance(key, compatibility._inttypes):
            line = nth(key, f)
            result = next(csv.reader([line], **self.dialect))
        elif isinstance(key, list):
            lines = nth_list(key, f)
            result = csv.reader(lines, **self.dialect)
        elif isinstance(key, slice):
            start, stop, step = key.start, key.stop, key.step
            result = csv.reader(it.islice(f, start, stop, step), **self.dialect)
        else:
            raise IndexError("key '%r' is not valid" % key)
        try:
            if not isinstance(result, Iterator):
                f.close()
        except AttributeError:
            pass
        return result
Example #3
def open_file(infile, informat='raw', encoding="utf-8", **kwargs):
    logger.debug('Opening file: {}'.format(infile))
    if isinstance(infile, basestring):
        if informat == "vnd.ms-excel" or informat == 'xls':
            import xlrd
            logger.debug('An office file!')
            f = xlrd.open_workbook(infile, on_demand=True)
        elif informat == "xml":
            logger.debug('An XML file!')
            f = etree.parse(infile)
        elif informat == "csv":
            logger.debug('Opening as csv')
            f = csv.reader(open(infile, 'r'),
                           encoding=encoding,
                           **kwargs)
        else:
            f = codecs.open(infile, 'r', encoding)
    else:
        if informat == "vnd.ms-excel" or informat == 'xls':
            import xlrd
            logger.debug('An office file!')
            f = xlrd.open_workbook(file_contents=infile.read(), on_demand=True)
        elif informat == "xml":
            logger.debug('An XML file!')
            f = etree.fromstring(infile)
        elif informat == "csv":
            logger.debug("CSV file")
            f = csv.reader(infile, encoding=encoding, **kwargs)
        else:
            f = codecs.iterdecode(iter(infile.readline, ""), encoding)
    return f
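A minimal usage sketch for open_file (not from the original project), assuming a hypothetical file name and that csv here is the unicodecsv module (as the encoding keyword suggests):

rows = open_file('contacts.csv', informat='csv')  # hypothetical CSV path
for row in rows:
    print(row)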
Example #4
    def __init__(self,file=None):

        filesniff = open(file)
        try:
            dialect = unicodecsv.Sniffer().sniff(filesniff.read(1024))
            wb = unicodecsv.reader(open(file),dialect,encoding='utf-8')
        except Exception:
            wb = unicodecsv.reader(open(file),delimiter=',',encoding='utf-8')

        self.wb = wb

        reader = wb
        rows = []
        columns = []
        #
        # for rownum in range(sh1.nrows): # sh1.nrows -> number of rows (ncols -> num columns)
        #     rows.append(sh1.row_values(rownum))

        for row in reader:
            rows.append(row)

        print rows

        columns = self.columnsExtract(rows)


        print columns

        res = Generator().main(rows=rows,columns=columns)

        self.res = res
Example #5
File: barChart.py Project: DanyaLagos/CFSS
def getBarChartData(): #First function to get Bar Chart Data
    f_artists = open('artists.csv') #Opens artists.csv and stores the file object.
    f_albums = open('albums.csv') #Opens albums.csv and stores the file object.

    artists_rows = csv.reader(f_artists) #Creates a CSV reader over the rows of artists.csv
    albums_rows = csv.reader(f_albums) #Creates a CSV reader over the rows of albums.csv

    artists_header = artists_rows.next() #Reads the header row of artists.csv
    albums_header = albums_rows.next() #Reads the header row of albums.csv

    artist_names = [] # New list in which to store artist names.
    
    decades = range(1900, 2020, 10) #Limits and intervals for the decade dictionary, which stores album counts by decade.
    decade_dict = {} #Creates the decade dictionary.
    for decade in decades: #Initialises the dictionary,
        decade_dict[decade] = 0 #setting each decade's count to 0.

    for artist_row in artists_rows: #Iterates over the rows of artists.csv
        if not artist_row: #Skips empty rows.
            continue
        artist_id, name, followers, popularity = artist_row #Unpacks the values from this row of artists.csv
        artist_names.append(name) #Puts every artist name in the list.

    for album_row in albums_rows: #Iterates over the rows of albums.csv
        if not album_row: #Skips empty rows.
            continue
        artist_id, album_id, album_name, year, popularity = album_row #Unpacks the values from this row of albums.csv
        for decade in decades: #Checks each decade,
            if (int(year) >= int(decade)) and (int(year) < (int(decade) + 10)): #counting the album under the decade containing its year.
                decade_dict[decade] += 1
                break #Stops once the album's decade is found.

    x_values = decades #for upcoming chart, sets decades as the x value
    y_values = [decade_dict[d] for d in decades] #for upcoming chart, sets number of albums as y value
    return x_values, y_values, artist_names #brings all of the values generated in the function for the upcoming barchart function
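A minimal usage sketch for getBarChartData (not part of the original script), assuming artists.csv and albums.csv sit next to it:

x_values, y_values, artist_names = getBarChartData()
for decade, album_count in zip(x_values, y_values):
    print('%d: %d albums' % (decade, album_count))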
def processData():
    global manualIgnoreRecords
    global yesIgnoreRecords
    global manualProcessedRecords
    global yesProcessedRecords

    dirpath = parentdir + "/R3_profiles_YNNM_raw/"
    with open(dirpath + 'MANUAL_RAW.csv', 'r') as infile, open(processeddir + 'MANUAL_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in manual_ignore_list:  # Ignore it
                manualIgnoreRecords += 1
                continue
            else:
                manualProcessedRecords += 1
                writer.writerow(row)

    with open(dirpath + 'YES_RAW.csv', 'r') as infile, open(processeddir + 'YES_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in yes_ignore_list:  # Ignore it
                yesIgnoreRecords += 1
                continue
            else:
                yesProcessedRecords += 1
                writer.writerow(row)
    def _get_headers(self, resource):
        """
        Get CSV file headers from the provided resource.
        """

        # If the resource is a file we just open it up with the csv
        # reader (after being sure we're reading from the beginning
        # of the file
        if type(resource) == file:
            resource.seek(0)
            reader = csv.reader(resource)
        # If the resource is a basestring it is either a url or a file
        # location, so similarly to the specification mechanism we either
        # access it with an HTTP get request or by opening the file.
        elif isinstance(resource, basestring):
            result = six.moves.urllib.parse.urlparse(resource)
            if result.scheme in ['http', 'https']:
                with closing(requests.get(resource, stream=True)) as response:
                    # Headers are always the first row of a CSV file
                    # so it's enough to just get the first line and
                    # hopefully save bandwidth
                    header_row = response.iter_lines().next()
            else:
                # It may seem odd to open a csv file, read its header row and
                # then wrap that row in StringIO for a new csv reader, but we
                # want to close this file and keep the same interface for all
                with open(resource) as resource_file:
                    reader = csv.reader(resource_file)
                    header_row = reader.next()

            reader = csv.reader(cStringIO.StringIO(header_row))
        else:
            raise IOError('Resource type not supported')
        return reader.next()    
Example #8
File: sto_fc.py Project: strongy/sto-data
def output_lfc_diff_using_csv(path_to_csv_earlier, path_to_csv_later, output_path):
    fleet_time_a = {}
    fleet_time_b = {}
    fleet_diff = {}
    with open(path_to_csv_earlier, 'rUb') as csvfile:
        reader = csv.reader(csvfile, errors='ignore')
        for row in reader:
            if len(row) >= 2:
                fleet_time_a[row[0].strip()] = long(row[1].strip())
    with open(path_to_csv_later, 'rUb') as csvfile:
        reader = csv.reader(csvfile, errors='ignore')
        for row in reader:
            if len(row) >= 2:
                fleet_time_b[row[0].strip()] = long(row[1].strip())
    for account_name in fleet_time_b.keys():
        if account_name in fleet_time_a:
            lfc_diff = fleet_time_b[account_name] - fleet_time_a[account_name]
        else:
            lfc_diff = fleet_time_b[account_name]
        if lfc_diff > 0:
            fleet_diff[account_name] = lfc_diff
    fleet_diff_output = fleet_diff.items()
    fleet_diff_output.sort(key=lambda account_tuple: account_tuple[1], reverse=True)
    with open(output_path, 'wb') as csvfile:
        cwriter = csv.writer(csvfile)
        for account_tuple in fleet_diff_output:
            cwriter.writerow(account_tuple)
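A minimal usage sketch (hypothetical file paths):

output_lfc_diff_using_csv('fleet_earlier.csv', 'fleet_later.csv', 'fleet_diff.csv')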
Example #9
def validate_and_return_rows(csv_file_form=None, csv_file=None, has_header_row=True, required_fields=[]):
	"""
		Opens a CSV file and optionally checks for required fields in the header. Returns
		the rows of the CSV and a list of the headers as a tuple: (csv_rows, headers)
	"""
	if csv_file_form:
		csv_file = csv_file_form.cleaned_data['file']
		has_header_row = csv_file_form.cleaned_data['has_header_row']

	if not csv_file:
		raise Exception("Pass in CsvFileForm instance or csv_file=request.FILES.get('file')")

	column_headers = []
	if has_header_row:
		# Read and store the header row column names
		r = unicodecsv.reader(csv_file.read().splitlines(), encoding='utf-8')
		row = r.next()
		for column_header in row:
			column_headers.append(column_header)

		# Check for required fields (optional check)
		for field in required_fields:
			if field not in column_headers:
				raise Exception("Invalid CSV file. Must contain %s." % field)

	csv_file.seek(0)
	return (unicodecsv.reader(csv_file.read().splitlines(), encoding='utf-8'), column_headers)
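A minimal usage sketch from a Django view (not from the original project); request and the required column name are hypothetical:

csv_rows, headers = validate_and_return_rows(
    csv_file=request.FILES.get('file'),  # hypothetical upload in a view
    required_fields=['email'],           # hypothetical required column
)
for row in csv_rows:
    print(row)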
Example #10
File: patch.py Project: paulfitz/catsql
def patchsql(sys_args):

    parser = argparse.ArgumentParser(description='Patch a database.')

    parser.add_argument('url', help='Sqlalchemy-compatible database url')

    parser.add_argument('--patch', nargs=1, required=False, default=None,
                        help="A csv file describing the patch. In the "
                        "format output by daff.")

    parser.add_argument('--follow', nargs=2, required=False, default=None,
                        help="An alternative to --patch option.  Specify"
                        "two csv files to compare, and patch from their diff.")

    parser.add_argument('--table', nargs=1, required=True, default=None,
                        help='Table to which patch should be applied.')

    parser.add_argument('--safe-null', required=False, action='store_true',
                        help='Decode nulls in a reversible way.')

    parser.add_argument('--quiet', required=False, action='store_true',
                       help='Do not show computed diff.')

    args = parser.parse_args(sys_args)

    url = args.url
    tables = args.table

    db = SqlAlchemyDatabase(url)
    st = daff.SqlTable(db, daff.SqlTableName(tables[0]))

    patch = None

    if args.patch:
        with open(args.patch[0], 'rt') as fin:
            patch = list(csv.reader(fin))
            patch = daff.Coopy.tablify(patch)

    if args.follow:
        with open(args.follow[0], 'rt') as fin:
            table0 = list(csv.reader(fin))
            fix_nulls(table0, args.safe_null)
        with open(args.follow[1], 'rt') as fin:
            table1 = list(csv.reader(fin))
            fix_nulls(table1, args.safe_null)
        patch = daff.Coopy.diff(table0, table1)
        ansi_patch = daff.Coopy.diffAsAnsi(table0, table1)
        if not args.quiet:
            print(ansi_patch, file=sys.stderr, end='')

    if not patch:
        raise KeyError('please specify either --patch or --follow')

    daff_patch = daff.HighlightPatch(st, patch)
    daff_patch.apply()
    if db.events['skips'] != 0:
        print(" * {}".format(json.dumps(db.events),
                             file=sys.stderr))
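A minimal usage sketch (hypothetical database URL, table name, and patch file):

patchsql([
    'sqlite:///example.db',         # hypothetical SQLAlchemy URL
    '--table', 'people',            # hypothetical table name
    '--patch', 'people_patch.csv',  # hypothetical daff-format patch
])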
    def compare_csvs(self, csv1, csv2):
        """Compara dos csvs a ver si son iguales."""
        with open(csv1, "rb") as csvfile1, open(csv2, "rb") as csvfile2:
            reader1 = unicodecsv.reader(csvfile1, delimiter=str(","), quotechar=str('"'))
            reader2 = unicodecsv.reader(csvfile2, delimiter=str(","), quotechar=str('"'))
            rows1 = set((",".join(row) for row in reader1))
            rows2 = set((",".join(row) for row in reader2))

            self.assertEqual(len(rows1), len(rows2))
            self.assertEqual(rows1, rows2)
Example #12
def main(args):

    finished = defaultdict(int)
    input_lines = []
    skipped = defaultdict(int)
    written = defaultdict(int)

    # load input data
    with args.input_file as fh:
        csvread = csv.reader(fh, delimiter=str(args.input_csv_delim),
                             quotechar=b'"', encoding="UTF-8")
        columns = DataLine.get_columns_from_header(csvread.next())
        for row in csvread:
            input_lines.append(DataLine.from_csv_line(row, columns))

    # load all results files provided
    for finished_file in args.finished_files:
        with finished_file as fh:
            csvread = csv.reader(fh, delimiter=str(args.finished_csv_delim),
                                 quotechar=b'"', encoding="UTF-8")
            header = csvread.next()
            columns = DataLine.get_columns_from_header(header)
            try:
                judgment_column = header.index('check_result')
            except ValueError:
                judgment_column = None
            for row in csvread:
                # treat rejected as unfinished
                if judgment_column is not None and row[judgment_column].startswith('N'):
                    continue
                # keep track of how many judgments are finished in the results
                finished_line = DataLine.from_csv_line(row, columns)
                finished[finished_line.signature] += 1

    print >> sys.stderr, "Loaded input: %d, Loaded finished: %d" % (len(input_lines), len(finished))

    with sys.stdout as fh:
        # starting with the header
        csvwrite = csv.writer(fh, delimiter=b"\t", lineterminator="\n", encoding="UTF-8")
        csvwrite.writerow(DataLine.get_headers())
        # write rows requiring different number of judgments, starting from the most judgments
        for judg_req in xrange(args.num_judgments, 0, -1):

            csvwrite.writerow(("# Requiring %d judgments" % judg_req,))
            for line in input_lines:
                if finished[line.signature] != args.num_judgments - judg_req:
                    skipped[judg_req] += 1
                    continue
                csvwrite.writerow(line.as_tuple())
                written[judg_req] += 1

            print >> sys.stderr, ("%d judgments -- written: %d" % (judg_req, written[judg_req]))

    print >> sys.stderr, "Skipped: %d" % (len(input_lines) - sum(written.values()))
Example #13
def main():

    #Connect to database file (Note: can also pass as a file using sys)
    CONN = sqlite3.connect('melon.db')

    #This cursor object passes commands from python and executes them in the sqlite3 melon.db
    DB = CONN.cursor()

    #Deletes tables if they exist. Good if I made a mistake creating them.
    DB.execute('''DROP TABLE IF EXISTS Customers;''')
    DB.execute('''DROP TABLE IF EXISTS Orders;''')

    #Create 2 tables of Customers & Orders
    DB.execute('''CREATE TABLE Customers (customer_id INTEGER PRIMARY KEY NOT NULL, first varchar(30), last varchar(30), email varchar(60), telephone varchar(30), called DATE);''')

    #Note: Foreign key & Reference needs to be stated at the end of the create table dialog.
    DB.execute('''CREATE TABLE Orders (order_id INTEGER PRIMARY KEY NOT NULL, order_date DATE,status varchar(30), customer_id INTEGER,email varchar(60),address varchar(30),city varchar(30),state varchar(30),postalcode varchar(30),num_watermelons INTEGER,num_othermelons INTEGER ,subtotal INTEGER ,tax INTEGER ,order_total INTEGER, FOREIGN KEY (customer_id) REFERENCES Customers(customer_id)); ''')


    #The CSV reader reads each line, strips it, and splits it into a list object. :)

    f1reader = unicodecsv.reader(open('customers.csv'), encoding='utf-8')
    f2reader = unicodecsv.reader(open('orders.csv'), encoding='utf-8')

    
    #next() skips the column names / headers of each csv file

    next(f1reader)
    next(f2reader)

# Text formatting in the files (e.g. '~' and other special characters) cannot be interpreted by sqlite. Need to import the unicodecsv module to parse them.

#Note: Some customer records contain odd-looking text with strange characters. Can they be flagged and edited later on (e.g., like editing student records)?


    for row in f1reader:

        DB.executemany('''INSERT INTO customers (customer_id, first, last, email, telephone, called) VALUES(?, ?, ?, ?, ?, ?);''', (row, ))

    for row2 in f2reader:

        DB.executemany('''INSERT INTO orders (order_id, order_date, status, customer_id, email, address, city, state, postalcode, num_watermelons, num_othermelons, subtotal, tax, order_total) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);''', (row2, ))

        #Change empty string row values to Null.
        # DB.executemany('''UPDATE orders SET VALUES 
        #   ''')



    
    CONN.commit()
    CONN.close()
def read_files(qfile, qcatfile, catfile):
	"""
	read from .csv files
		qfile - .csv file containing the SMS Guru questions
		qcatfile - .csv file containing the relation between questions and category
		catfile - .csv file containing the categories
	"""
	with open(qfile, 'rb') as csvfile:
		question_train = list(unicodecsv.reader(csvfile, delimiter=",", quoting=unicodecsv.QUOTE_ALL, escapechar="\\", encoding='utf-8'))
	with open(qcatfile, 'rb') as csvfile:
		question_category_train = list(unicodecsv.reader(csvfile, delimiter=",", quoting=unicodecsv.QUOTE_ALL, escapechar="\\", encoding='utf-8'))
	with open(catfile, 'rb') as csvfile:
		category = list(unicodecsv.reader(csvfile, delimiter=",", quoting=unicodecsv.QUOTE_ALL, escapechar="\\", encoding='utf-8'))
	return question_train, question_category_train, category
def read_files(qfile_train, qfile_test, catfile):
	"""
	read from .csv files
		qfile_train - .csv file containing the SMS Guru questions for the train set
		qfile_test - .csv file containing the SMS Guru questions for the test set
		catfile - .csv file containing the categories
	"""
	with open(qfile_train, 'rb') as csvfile:
		question_train = list(unicodecsv.reader(csvfile, delimiter=",", quoting=unicodecsv.QUOTE_ALL, escapechar="\\", encoding='utf-8'))
	with open(qfile_test, 'rb') as csvfile:
		question_test = list(unicodecsv.reader(csvfile, delimiter=",", quoting=unicodecsv.QUOTE_ALL, escapechar="\\", encoding='utf-8'))
	with open(catfile, 'rb') as csvfile:
		category = list(unicodecsv.reader(csvfile, delimiter=",", quoting=unicodecsv.QUOTE_ALL, escapechar="\\", encoding='utf-8'))
	return question_train, question_test, category
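A minimal usage sketch for read_files (hypothetical file names):

question_train, question_test, category = read_files(
    'question_train.csv', 'question_test.csv', 'category.csv')
print('%d train questions, %d test questions, %d categories'
      % (len(question_train), len(question_test), len(category)))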
Example #16
def main(wordlist1, wordlist2, dist_funcs):
    with open(wordlist1, 'rb') as file_a, open(wordlist2, 'rb') as file_b:
        reader_a = csv.reader(file_a, encoding='utf-8')
        reader_b = csv.reader(file_b, encoding='utf-8')
        print('Reading word lists...')
        words = list(zip([(w, g) for (g, w) in reader_a],
                    [(w, g) for (g, w) in reader_b]))
        words_a, words_b = list(zip(*[(a, b) for (a, b) in words if a and b]))
        print('Constructing cost matrix...')
        matrix = construct_cost_matrix(words_a, words_b, dist_funcs)
        m = munkres.Munkres()
        print('Computing matrix using Hungarian Algorithm...')
        indices = m.compute(matrix)
        print(score(indices))
        print('Done.')
Example #17
    def process(self):
        report = self.receive_message()
        
        if report:
            event = Event()

            columns = ["__IGNORE__", "source_url", "description_url", "source_time", "__IGNORE__", "__IGNORE__", "__IGNORE__", "target"]
            
            for row in unicodecsv.reader(StringIO(report), encoding='utf-8'):

                if "phish_id" in row:
                    continue
                
                for key, value in zip(columns, row):

                    if key == "__IGNORE__":
                        continue
                    
                    event.add(key, value.strip())
                
                event.add('feed', 'phishtank')
                event.add('type', 'phishing')

                event = utils.parse_source_time(event, "source_time")
                event = utils.generate_observation_time(event, "observation_time")
                event = utils.generate_reported_fields(event)
                    
                self.send_message(event)
             
        self.acknowledge_message()
Example #18
def writeUniqueResults(clustered_dupes, input_file, output_file):

    # Write our original data back out to a CSV with a new column called 
    # 'Cluster ID' which indicates which records refer to each other.

    logging.info('saving unique results to: %s' % output_file)

    cluster_membership = {}
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        for record_id in cluster:
            cluster_membership[record_id] = cluster_id

    unique_record_id = cluster_id + 1

    writer = csv.writer(output_file)

    reader = csv.reader(StringIO(input_file))

    heading_row = next(reader)
    heading_row.insert(0, u'Cluster ID')
    writer.writerow(heading_row)

    seen_clusters = set()
    for row_id, row in enumerate(reader):
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]
            if cluster_id not in seen_clusters:
                row.insert(0, cluster_id)
                writer.writerow(row)
                seen_clusters.add(cluster_id)
        else:
            cluster_id = unique_record_id
            unique_record_id += 1
            row.insert(0, cluster_id)
            writer.writerow(row)
Example #19
	def from_csv(self,path=None,add=False):

		"""Reads a previously constructed CITES<->NCBI CSV mapping database."""
		
		# open the file and read the .csv
		logging.debug('Going to read csv file "%s"' % path)
		with open(path, 'rb') as csvfile:
			read = csv.reader(csvfile, encoding='utf-8', delimiter=',', quotechar='"')
			for line in read:
				
				# store the date
				if line[0] == u'Date':
					self.date = line[1]
					logging.debug('Stored date: "%s"' % self.date)
				
				# skip comment lines
				elif re.match('^#', line[0]):
					logging.debug('Skipping comment line: %s' % line)
					continue
				
				# ncbi taxid, name, description, ncbi name, CITES appendix
				else:                                
					taxid = line[0]
					name  = line[1]
					desc  = line[2]
					canon = line[3]
					app   = line[4]
					self.taxa.append( Taxon(
						name=name,
						description=desc,
						appendix=app,
						ncbi={taxid:canon}
					) )
					logging.debug('Instantiated "%s" with {%s:%s}' % (name,taxid,canon))
Example #20
def main():

    rnd.seed(1206)

    ap = ArgumentParser()
    # TODO use more files ?
    ap.add_argument('-b', '--bootstrap-iters', type=int, default=1000)
    ap.add_argument('cf_output', type=str, help='crowdflower results file')

    args = ap.parse_args()
    votes = defaultdict(int)
    res = []

    with open(args.cf_output, 'rb') as fh:
        csvread = csv.reader(fh, delimiter=b',', quotechar=b'"', encoding="UTF-8")
        headers = csvread.next()
        for row in csvread:
            row = Result(row, headers)
            if row._golden == 'true':  # skip test questions
                continue
            if row.more_natural == 'A less than B':
                votes[row.origin_b] += 1
                res.append(row.origin_b)
            elif row.more_natural == 'A more than B':
                votes[row.origin_a] += 1
                res.append(row.origin_a)

    for key, val in votes.iteritems():
        print '%s\t%d (%2.2f)' % (key, val, float(val) / len(res) * 100)

    pairwise_bootstrap(res, args.bootstrap_iters)
Example #21
    def test_csv_export(self):
        """Ensures exported CSV data matches source data"""
        qs_filter = { "pk__in": [x.pk for x in self.snapshots] }
        qs = BuildingSnapshot.objects.filter(**qs_filter)

        fields = list(_get_fields_from_queryset(qs))
        fields.append("canonical_building__id")

        export_filename = export_csv(qs, fields)
        export_file = open(export_filename)

        reader = csv.reader(export_file)
        header = reader.next()

        self.assertEqual(header[len(fields)-1], 'ID')

        for i in range(len(self.snapshots)):
            row = reader.next()
            for j in range(len(fields)):
                field = fields[j]
                components = field.split("__")
                qs_val = qs[i]
                for component in components:
                    qs_val = getattr(qs_val, component)
                    if qs_val == None:
                        break
                if isinstance(qs_val, Manager) or qs_val == None: qs_val = u''
                else: qs_val = unicode(qs_val)
                csv_val = row[j]
                self.assertEqual(qs_val, csv_val)

        export_file.close()
        os.remove(export_filename)
Example #22
    def handle_label(self, label, **options):
        verbosity = int(options['verbosity'])

        if not os.path.exists(label):
            print >> sys.stderr, "The parties file doesn't exist",
            sys.exit(1)

        #get the party kind object
        partykind = OrganisationKind.objects.get(slug='party')

        #check each party by checking against slug
        with open(label, 'rb') as csvfile:
            parties = unicodecsv.reader(csvfile)
            for slug, name in parties:
                try:
                    party = Organisation.objects.get(slug=slug)
                    if party.name != name:
                        if verbosity >= 1:
                            print 'Updating party %s from %s to %s' % (slug, party.name, name)
                        party.name = name
                        party.save()
                except Organisation.DoesNotExist:
                    #we need to add the party
                    if verbosity >= 1:
                        print 'Adding party %s' % name
                    Organisation.objects.create(
                        name = name,
                        slug = slug,
                        kind = partykind)
Example #23
def read_db_csvfile( file_name, delim = "|" ):

    # read the file
    db_reader = unicodecsv.reader( open(file_name), delimiter=delim.encode('utf-8'), encoding='cp1252')

    transactions = []
    mc_transactions = []
    for row in db_reader:
        LOGGER.debug("row: %r", row)
        # skip empty rows
        if not row:
            continue
        if row[1].startswith("D\xe9compte\xa0des\xa0d\xe9penses:\xa0carte\xa0de\xa0cr\xe9dit "):
            LOGGER.debug('"Décompte\xa0des\xa0dépenses" line dropped.')
        else:
            transaction, is_mastercard_transaction = _process_db_csv_entry( row )
            if is_mastercard_transaction:
                mc_transactions.append( _clean_transaction(transaction) )
                #LOGGER.debug("mastercard transaction: %r", transaction)
            else:
                transactions.append( _clean_transaction(transaction) )
                #LOGGER.debug("transaction: %r", transaction)
            #pprint(transaction)

    return transactions, mc_transactions
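A minimal usage sketch (hypothetical bank-export path; the delimiter defaults to '|'):

transactions, mc_transactions = read_db_csvfile('statement.csv')
print('%d account transactions, %d Mastercard transactions'
      % (len(transactions), len(mc_transactions)))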
Example #24
def create_fixtures():
    loader = Loader('opennews', project_label='Open News',
                    project_settings={},
                    source_url=DEFAULT_SOURCE_URL)
    
    import_schema(loader.project, StringIO(SCHEMATA))

    reader = unicodecsv.reader(StringIO(DATA))
    reader.next()
     
    for record in reader:
        fellow = loader.make_entity(['fellow'])
        fellow.set('name', record[0])
        fellow.set('twitter_handle', record[1])
        fellow.save()
     
        news_org = loader.make_entity(['news_organization'])
        news_org.set('name', record[4])
        news_org.set('url', record[5])
        news_org.save()
     
        fellowship = loader.make_relation('fellowship', fellow, news_org)
        fellowship.set('start_date', record[2])
        fellowship.set('end_date', record[3])
        fellowship.save()

    loader.persist()
Example #25
File: collect.py Project: flother/landnr
def iter_rows(building_data):
    """
    Opens the given file-like object as a CSV file delimited by pipes,
    and yields a dict for each row containing the land plot number,
    building id, street address, post code, and longitude/latitude.
    """
    reader = unicodecsv.reader(building_data, delimiter="|", encoding="latin1")
    reader.next()  # Skip header row.
    for row in reader:
        land_plot_number = int(row[3])
        building_id = int(row[4])
        street_address = u" ".join(row[5].split())  # Normalise whitespace.
        try:
            post_code = int(row[7])
        except ValueError:
            post_code = None
        isn93_x = float(row[22].replace(",", "."))
        isn93_y = float(row[23].replace(",", "."))
        longitude, latitude = isnet93_to_wgs84(isn93_x, isn93_y)
        yield {
            "landnr": land_plot_number,
            "heitinr": building_id,
            "street": street_address,
            "postcode": post_code,
            "ll": (longitude, latitude),
        }
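A minimal usage sketch for iter_rows (hypothetical file name for the pipe-delimited export):

with open('buildings.csv', 'rb') as building_data:
    for building in iter_rows(building_data):
        print('%s (%s): %r' % (building['street'], building['postcode'], building['ll']))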
Example #26
def process_casen():
    f = open('casen_2014.csv', 'r')
    r = unicodecsv.reader(f, encoding='utf-8')
    comunas_names = r.next()
    datos_comuna = {}
    for i in range(3, len(comunas_names)):
        comuna = comunas_names[i].upper()
        result = scraperwiki.sqlite.select('id from data where muni="%s"' % (comuna, ))
        id_ = result[0].get('id')
        datos_comuna[i] = {'comuna_id': id_, 'comuna_name': comuna, 'dato': []}
    dato_counter = 0
    for datos in r:
        dato_counter += 1
        for j in range(3, len(comunas_names)):
            datos_comuna[j]['dato'].append({
                'id': dato_counter,
                'dato_name': datos[1].strip(),
                'value': datos[j]
            })
    dato_counter = 0
    for a in datos_comuna:
        for b in datos_comuna[a]['dato']:
            final = {'id': dato_counter,
                     'id_muni': datos_comuna[a]['comuna_id'],
                     'dato_name': b['dato_name'],
                     'value': b['value']
            }
            dato_counter += 1
            scraperwiki.sqlite.save(unique_keys=['id'], data=final, table_name='datos_comuna')
Example #27
    def read_tsv(cls, path, encoding="utf-8"):
        """Read a gene set database from a tab-delimited text file.

        Parameters
        ----------
        path: str
            The path name of the file.
        encoding: str
            The encoding of the text file.

        Returns
        -------
        An instance of this class containing the gene sets read from the file.
        """
        gene_sets = []
        n = 0
        with open(path, "rb") as fh:
            reader = csv.reader(fh, dialect="excel-tab", encoding=encoding)
            for l in reader:
                n += 1
                gs = GeneSet.from_list(l)
                gene_sets.append(gs)
        logger.debug("Read %d gene sets.", n)
        logger.debug("Size of gene set list: %d", len(gene_sets))
        return cls(gene_sets)
Example #28
	def from_dump(self,path):

		"""Reads a downloaded CITES database dump."""
		
		# open the file and read the .csv
		logging.info('Reading the CITES data dump.')
		header = {}
		with open(path, 'rb') as csvfile:
			read = csv.reader(csvfile, delimiter=',', quotechar='"', encoding='utf-8')
			for line in read:
				if not header:
					header = line
					continue
				else:
					record = {}
					for idx, val in enumerate(header):
						record[header[idx]] = line[idx]
					if record[u'CitesAccepted'] == u'true':
						app   = { u'I':1, u'II':2, u'III':3 }
						for i in record[u'CurrentListing'].split(u'/'):
							if i in app:
								taxon = Taxon(
									name=record[u'FullName'],
									description=record[u'AnnotationEnglish'],
									appendix=unicode(app[i])
								)
								self.taxa.append(taxon)
Example #29
    def __init__(self,filename,header=None,tabs=False,encoding="utf-8",logger=None):
        if logger is None:
            self.logger = global_logger
        else:
            self.logger = logger

        dialect = unicodecsv.excel
        # Remember the file name, adding the proper extension if it is missing.
        self.filename = filename
        if tabs:
            dialect = unicodecsv.excel_tab
            if not self.filename.endswith(".tsv"):
                self.filename = filename + ".tsv"
        else:
            if not self.filename.endswith(".csv"):
                self.filename = filename + ".csv"


        self.__filehandle = open(self.filename,"rb")
        self._reader = unicodecsv.reader(self.__filehandle, encoding=encoding, dialect=dialect)
        if header is None:
            self.header = {}
            h = self._reader.next()
            for i, f in enumerate(h):
                cn = get_canonical_name(f)
                if cn[0] is not None:
                    self.header[i] = cn[0]
        else:
            self.header = header
        self.line_length = len(self.header)
Example #30
  def itervoters(self):
    if self.voter_file_content:
      voter_stream = StringIO.StringIO(self.voter_file_content)
    else:
      voter_stream = open(self.voter_file.path, "rU")

    #reader = unicode_csv_reader(voter_stream)
    reader = unicodecsv.reader(voter_stream, encoding='utf-8')

    for voter_fields in reader:
      # bad line
      if len(voter_fields) < 1:
        continue
    
      return_dict = {'voter_id': voter_fields[0]}

      if len(voter_fields) > 1:
        return_dict['email'] = voter_fields[1]

      if len(voter_fields) > 2:
        return_dict['name'] = voter_fields[2]

      if len(voter_fields) > 3:
        return_dict['group'] = voter_fields[3]

      yield return_dict
Example #31
File: models.py Project: np/helios-server
    def itervoters(self):
        if self.voter_file_content:
            voter_stream = StringIO.StringIO(self.voter_file_content)
        else:
            voter_stream = open(self.voter_file.path, "rU")

        #reader = unicode_csv_reader(voter_stream)
        reader = unicodecsv.reader(voter_stream, encoding='utf-8')

        for voter_fields in reader:
            # bad line
            if len(voter_fields) < 1:
                continue

            return_dict = {'voter_id': voter_fields[0]}

            if len(voter_fields) > 1:
                return_dict['email'] = voter_fields[1]

            if len(voter_fields) > 2:
                return_dict['name'] = voter_fields[2]

            yield return_dict
Example #32
def train_stats(f_name, eou='__eou__', eot='__eot__'):
    pos_utterances = []
    pos_turns = []
    pos_words = []
    neg_utterances = []
    neg_turns = []
    neg_words = []

    reader = unicodecsv.reader(open(f_name))
    next(reader)  # skip header
    for line in reader:
        if int(float(line[2])) == 1:
            pos_utterances.append(line[0].count(eou))
            pos_turns.append(line[0].count(eot))
            pos_words.append(len(line[0].split()))
        elif int(float(line[2])) == 0:
            neg_utterances.append(line[0].count(eou))
            neg_turns.append(line[0].count(eot))
            neg_words.append(len(line[0].split()))
        else:
            print line[2]

    return pos_utterances, pos_turns, pos_words, neg_utterances, neg_turns, neg_words
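A minimal usage sketch for train_stats (hypothetical path to an Ubuntu-Dialogue-style training CSV):

pos_utt, pos_turns, pos_words, neg_utt, neg_turns, neg_words = train_stats('train.csv')
print('positive examples: %d, negative examples: %d' % (len(pos_utt), len(neg_utt)))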
Example #33
    def post(self, request, *args, **kwargs):
        file = request.data.get(u'file', None)
        delimiter = str(request.data.get('delimiter', None))
        encoding = request.data.get(u'encoding', None)

        if request.data.get(u'mapping'):
            mapping = json.loads(request.data.get(u'mapping'))
        else:
            mapping = dict()

        headers = csv.reader(file, encoding=encoding,
                             delimiter=delimiter).next()

        reader = csv.DictReader(file,
                                fieldnames=headers,
                                encoding=encoding,
                                delimiter=delimiter)

        result = []
        for v in self.generate(reader, mapping):
            result.append(v)

        return response.Response(data=result, status=status.HTTP_200_OK)
def download_csv_filter_output(source, dataset_id, columns):
    """
    Download CSV resource from dataset and print the columns
    """
    ckan = ckanapi.RemoteCKAN(source)
    dataset = ckan.action.package_show(id=dataset_id)
    url = dataset['resources'][0]['url']
    response = requests.get(url, stream=True)
    csv = unicodecsv.reader(response.iter_lines(), encoding='utf-8')
    # skip header row
    next(csv)
    for line in csv:
        out = {}
        for k, col_num in columns.iteritems():
            out[k] = line[col_num]
        # skip blanks
        if not out['id']:
            continue
        if out['id'] in choices:
            sys.stderr.write('duplicate id: %r!\n' % out['id'])
            continue

        choices[out['id']] = {'en': out['en'], 'fr': out['fr']}
def read_twitter_csv(src):
    """
    Reads Twitter dataset in .csv format

    Parameters
    ----------
    src: str - path to csv file.

    Returns
    -------
    dict
        Mapping from tweet ID to tweet text: {tid: text}.

    """
    data = {}
    with open(src, "rb") as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            tid, text = row
            data[tid] = text
    return data
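A minimal usage sketch (hypothetical tab-separated tweet file):

tweets = read_twitter_csv('tweets.csv')
print('loaded %d tweets' % len(tweets))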
Example #36
def import_expenditure():
    '''
    Update the expenditure table
    deleting any data for the city/business_type
    '''
    f = open('expenditure.csv')
    r = unicodecsv.reader(f, encoding='utf-8')
    headers = r.next()
    for row in r:
        row_dict = dict(zip(headers, row))
        assert(row_dict['BUSINESS_TYPE'] in BUSINESS_TYPES)
        Expenditure.query.filter_by(
            city=row_dict['CITY'],
            type=row_dict['BUSINESS_TYPE'],
        ).delete()
        exp = Expenditure(
            city=row_dict['CITY'],
            type=row_dict['BUSINESS_TYPE'],
            spend=row_dict['SPEND_PER_CAPITA'],
        )
        db.session.add(exp)
        db.session.commit()
    print 'Expenditure imported'
Example #37
def load_model_item_mapping(item_csv_path=config.MATCHING_MODEL_DRESS_DETAILS_PATH):
    '''
    This is a hack to map from the pre-sqlalchemy IDs to the primary key IDs in
    the database. At startup, we load a mapping between the old and new IDs and
    use it when querying for Items/ItemImages.
    '''
    with open(item_csv_path, 'rb') as f:
        records = list(csv.reader(f))
        header, records = records[0], records[1:]
        original_recs = [dict(zip(header, rec)) for rec in records]
    
    original_recs_by_url = {r['detail_url']: r for r in original_recs}

    item_mapping = {}
    session = models.Session()
    for item in session.query(models.Item).all():
        mapped_item = original_recs_by_url.get(item.detail_url)
        if not mapped_item:
            print("NO MAPPING FOUND: {}".format(item.detail_url))
            continue
        item_mapping[int(mapped_item['dress_id'])] = item.id
    
    return item_mapping
Example #38
def restart_harvest(args):
    harvest = get_harvest(args)
    data_dir = os.path.join(os.getcwd(), 'data', harvest)
    meta = get_metadata(data_dir)
    if meta:
        try:
            with open(os.path.join(data_dir, 'results.csv'), 'rb') as csv_file:
                reader = csv.reader(csv_file, delimiter=',', encoding='utf-8')
                rows = list(reader)
            if len(rows) > 1:
                start = len(rows) - 2
                # Remove the last row in the CSV just in case there was a problem
                rows = rows[:-1]
                with open(os.path.join(data_dir, 'results.csv'), 'wb') as csv_file:
                    writer = csv.writer(csv_file, delimiter=',', encoding='utf-8')
                    for row in rows:
                        writer.writerow(row)
            else:
                start = 0
        except IOError:
            # Nothing's been harvested
            start = 0
        start_harvest(data_dir=data_dir, key=meta['key'], query=meta['query'], pdf=meta['pdf'], text=meta['text'], start=start, max=meta['max'])
Example #39
def main():
    with open('full_text_mapped.csv', 'rb') as f:
        r = csv.reader(f, delimiter='|', quotechar='@')
        data = list(r)

    results = []
    results.append(data[0] + ['polarity', 'subjectivity', 'readability'])

    data = data[1:]  # no header

    # parallel
    pool = multiprocessing.Pool(processes=4)
    results += pool.map(get_sentiment, data)

    #for x in data[0:3]:
    #	results.append(get_sentiment(x))

    with open('full_text_mapped_sentiment.csv', 'wb') as aa:
        writer = csv.writer(aa,
                            delimiter='|',
                            quotechar='@',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(results)
Example #40
def readCsvAndCountPercentPerFormItemFromGoogleForms(fileName):
    times = {}
    totalRows = 0
    with open(fileName, 'r') as csvfile:

        csvReader = csv.reader(csvfile, encoding='utf-8')
        csvReader.next()  # skip the first line
        for row in csvReader:
            value = row[1]
            '''
             Because I made some mistakes and fixed them later in Google Forms,
             I replace 6 with 7 and 27 with 28 because I mistook the dates
             of the Sundays.
            '''
            value = value.replace("6", "7").replace("27", "28")
            if (value in times.keys()):
                times[value] += 1
            else:
                times[value] = 1

            totalRows += 1

        return calculateDictionaryAsPercent(times, totalRows)
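A minimal usage sketch (hypothetical Google Forms export; assumes calculateDictionaryAsPercent returns a dict of answer -> percentage):

percentages = readCsvAndCountPercentPerFormItemFromGoogleForms('form_responses.csv')
for answer, percent in percentages.items():
    print('%s: %.1f%%' % (answer, percent))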
Example #41
File: clients.py Project: OpenOil-UG/aleph
def _dl_csv(refresh=False):
    if refresh or not hasattr(_dl_csv, 'projects'):
        response = requests.get(CSV_URL)
        response_text = requests.utils.get_unicode_from_response(
            response).encode('ascii', 'ignore')
        csvfile = csv.reader(response_text.splitlines(), encoding='utf-8')
        projects = {}
        # skip header rows
        csvfile.next()
        csvfile.next()
        for row in csvfile:
            metadata = {
                'name': row[0],
                'search_name': row[7] or row[0],  # replacement name
                'country': row[2],
            }
            for rulename, col in RULE_COLUMNS.items():
                metadata[rulename] = row[col] == 'y'
            projects[row[0].lower()] = metadata
        _dl_csv.projects = projects
    else:
        print('getting from cache')
    return _dl_csv.projects
Example #42
    def get_table(self, spreadsheet_url):
        # get list of lists where each inner list is a row in a spreadsheet

        match = re.match(r'^https://docs.google.com/spreadsheets/d/(\S+)/', spreadsheet_url)

        # Not sure this check is needed; URLValidator may already be catching this type of issue.
        if not match:
            raise ValidationError("Unable to extract key from Google Sheets URL")

        try:
            url = 'https://docs.google.com/spreadsheets/d/%s/export?format=csv' % match.group(1)
            response = requests.get(url, timeout=5)
            response.raise_for_status()
        except requests.RequestException as e:
            raise ValidationError("Error talking to Google Sheets: %s" % e.message)

        rows = csv.reader(io.BytesIO(response.content), encoding='utf-8')
        rows = list(rows)

        if not rows or not rows[0]:
            raise ValidationError("Your sheet did not import successfully; please check that it is 'Published to the web' and shared with 'Anyone with the link'")
        else:
            return rows
Example #43
def start_char_addition(file_name, end_char, start_char):
    with open(file_name, 'rb') as fin:
        with open(file_name.split('.')[0] + '_startchar.csv', 'w+') as fout:
            with open(file_name.split('.')[0] + '_startchar_param.csv',
                      'w+') as paramout:
                reader = csvu.reader(fin, encoding='utf-8')
                writer = csvu.writer(fout, encoding='utf-8')
                writer.writerow([start_char])
                sentences = 0
                words = 0
                for row in reader:
                    writer.writerow(row)
                    words = words + 1
                    if row == [end_char]:
                        writer.writerow([start_char])
                        sentences = sentences + 1

                par_writer = csv.writer(paramout)
                par_writer.writerow([words, sentences])

                paramout.close()
                fout.close()
                fin.close()
Example #44
    def test_converts_partition_content_to_csv(self):
        # prepare partition mock
        fake_partition = MagicMock(spec=Partition)
        fake_partition.dataset.vid = 'ds1vid'
        fake_partition.datafile.headers = ['col1', 'col2']
        fake_iter = lambda: iter([{'col1': '1', 'col2': '1'}, {'col1': '2', 'col2': '2'}])
        fake_partition.__iter__.side_effect = fake_iter

        # run.
        ret = _convert_partition(fake_partition)

        # check converted partition.
        self.assertIn('package_id', ret)
        self.assertEqual(ret['package_id'], 'ds1vid')
        self.assertIn('upload', ret)
        self.assertTrue(isinstance(ret['upload'], six.StringIO))
        rows = []
        reader = unicodecsv.reader(ret['upload'])
        for row in reader:
            rows.append(row)
        self.assertEqual(rows[0], ['col1', 'col2'])
        self.assertEqual(rows[1], ['1', '1'])
        self.assertEqual(rows[2], ['2', '2'])
Example #45
File: forms.py Project: TJKenney/muckrock
 def process_data_csv(self, crowdsource):
     """Create the crowdsource data from the uploaded CSV"""
     url_validator = URLValidator()
     data_csv = self.cleaned_data['data_csv']
     doccloud_each_page = self.cleaned_data['doccloud_each_page']
     if data_csv:
         reader = csv.reader(data_csv)
         headers = [h.lower() for h in next(reader)]
         for line in reader:
             data = dict(zip(headers, line))
             url = data.pop('url', '')
             doc_match = DOCUMENT_URL_RE.match(url)
             proj_match = PROJECT_URL_RE.match(url)
             if doccloud_each_page and doc_match:
                 datum_per_page.delay(
                     crowdsource.pk,
                     doc_match.group('doc_id'),
                     data,
                 )
             elif proj_match:
                 import_doccloud_proj.delay(
                     crowdsource.pk,
                     proj_match.group('proj_id'),
                     data,
                     doccloud_each_page,
                 )
             elif url:
                 # skip invalid URLs
                 try:
                     url_validator(url)
                 except forms.ValidationError:
                     pass
                 else:
                     crowdsource.data.create(
                         url=url,
                         metadata=data,
                     )
Example #46
def create(**kwargs):
    csv_file = kwargs.get('file')
    user = kwargs.get('user')
    dialect = kwargs.get('dialect', None)
    result = []
    try:
        f = codecs.open(csv_file, 'rU')
        if not dialect:
            dialect = csv.excel()
            dialect.delimiter = ';'
            dialect.skipinitialspace = True
        csv_content = csv.reader(format_lines(f), dialect)

        template = kwargs.get('template')
        account = kwargs.get('account')
        while True:
            try:
                row = csv_content.next()
                data = {}
                i = 0
                for item in row:
                    i = i + 1
                    data.update({'field%s' % i: item.strip()})

                if template == 'bradescopj':
                    _persist_bradescopj(data, account, user)
                elif template == 'itaupf':
                    _persist_itaupf(data, account, user)

                result.append(data)
            except StopIteration:
                break

    finally:
        f.close()

    return result
Example #47
 def run_import(self):
     zip_model = self.env['res.better.zip']
     country_code = self.country_id.code
     config_url = self.env['ir.config_parameter'].get_param(
         'geonames.url',
         default='http://download.geonames.org/export/zip/%s.zip')
     url = config_url % country_code
     logger.info('Starting to download %s' % url)
     res_request = requests.get(url)
     if res_request.status_code != requests.codes.ok:
         raise Warning(
             _('Got an error %d when trying to download the file %s.') %
             (res_request.status_code, url))
     # Store current record list
     zips_to_delete = zip_model.search([('country_id', '=',
                                         self.country_id.id)])
     f_geonames = zipfile.ZipFile(StringIO.StringIO(res_request.content))
     tempdir = tempfile.mkdtemp(prefix='openerp')
     f_geonames.extract('%s.txt' % country_code, tempdir)
     logger.info('The geonames zipfile has been decompressed')
     data_file = open(os.path.join(tempdir, '%s.txt' % country_code), 'r')
     data_file.seek(0)
     logger.info('Starting to create the better zip entries')
     for row in unicodecsv.reader(data_file,
                                  encoding='utf-8',
                                  delimiter='\t'):
         zip = self.create_better_zip(row, self.country_id)
         if zip in zips_to_delete:
             zips_to_delete -= zip
     data_file.close()
     if zips_to_delete:
         zips_to_delete.unlink()
         logger.info('%d better zip entries deleted for country %s' %
                     (len(zips_to_delete), self.country_id.name))
     logger.info('The wizard to create better zip entries from geonames '
                 'has been successfully completed.')
     return True
Example #48
    def test_csv_export_extra_data(self):
        """Ensures exported CSV data matches source data"""
        qs_filter = {"pk__in": [x.pk for x in self.snapshots]}
        qs = BuildingSnapshot.objects.filter(**qs_filter)

        fields = list(_get_fields_from_queryset(qs))
        fields.append("canonical_building__id")
        fields.append('my new field')

        export_filename = export_csv(qs, fields)
        export_file = open(export_filename)

        reader = csv.reader(export_file)
        header = reader.next()

        self.assertEqual(header[len(fields) - 1], 'my new field')

        for i in range(len(self.snapshots)):
            row = reader.next()
            for j in range(len(fields)):
                field = fields[j]
                components = field.split("__")
                qs_val = qs[i]
                for component in components:
                    try:
                        qs_val = getattr(qs_val, component)
                    except AttributeError:
                        qs_val = qs_val.extra_data.get(component)
                    if qs_val == None:
                        break
                if isinstance(qs_val, Manager) or qs_val == None: qs_val = u''
                else: qs_val = unicode(qs_val)
                csv_val = row[j]
                self.assertEqual(qs_val, csv_val)

        export_file.close()
        os.remove(export_filename)
Example #49
def simplify_sensor_log(sensor_log, readable=True):
    """
    Translate the given sensor log into a sequence of symbols, so that sequence classification techniques can be applied.
    Notice that, for the sake of readability, we allow only a maximum number of distinct symbols equal to the size
    of the English alphabet (which is enough for the scope of this project). In that case, a mapping between
    sensor ids and letters is automatically computed.

    :type sensor_log: file
    :param sensor_log: the tab-separated file containing the sensor log.
    :param readable: whether the mapping between sensor ids and letters has to be computed or not.
    """
    file_basename = os.path.splitext(sensor_log.name)[0]
    dest = file_basename + '_simplified.txt'
    dest_dict = file_basename + '_simplified_dict.txt'
    src_reader = csv.reader(sensor_log, delimiter=LOG_ENTRY_DELIMITER)
    sensor_id_dict = {}

    with open(dest, 'w') as simplified_log:
        entry = next(src_reader, None)
        while entry is not None:
            sensor_id = entry[SENSOR_ID_POS]

            if readable:
                try:
                    translation = sensor_id_dict[sensor_id]
                except KeyError:
                    translation = SYMBOLS[len(sensor_id_dict)]
                    sensor_id_dict[sensor_id] = translation
            else:
                translation = sensor_id

            simplified_log.write(translation + '\n')
            entry = next(src_reader, None)

    with open(dest_dict, 'w') as simplified_log_dict:
        for k, v in sensor_id_dict.items():
            simplified_log_dict.write('%s \t\t %s\n' % (v, k))
def format_feature_tests(csv_filepath):
    """Read csv data, write formatted feature tests"""
    csvfile = open(csv_filepath)
    reader = csv.reader(csvfile)
    feature_filepath = csv_filepath.replace('.csv', '')
    feature_file = open(feature_filepath, 'w')
    feature_file.write(feature_file_header)
    reader.next()       # discard header
    done = False
    col_widths = []
    data = []
    row = reader.next()
    try:
        while not done:
            filename = row[0]
            feature_file.write(examples_prefix + filename + '\n')
            rows = []
            for row in reader:
                if row[0]:      # next filename
                    break
                rows.append(row)
            else:   # no break, end of file
                done = True
            col_widths = fit_column_widths(rows)
            first_col = len(rows[0]) - len(col_widths)
            data = fieldnames[first_col:]   # section heading
            feature_file.write(expand(data, col_widths))
            for row_ in rows:
                data = row_[first_col:]
                line = expand(data, col_widths)
                feature_file.write(line.encode('utf8'))
            feature_file.write('\n')
    except StandardError:
        print '*** Error formatting data: {}'.format(data)
        print '    using widths: {}'.format(col_widths)
        print '    in test for {}'.format(filename)
        raise
    finally:
        csvfile.close()
        feature_file.close()
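# fit_column_widths() and expand() are project helpers that are not shown in
# this example; the sketch below is only a guess at their behaviour, inferred
# from how they are called above (skip the leading filename column, pad each
# cell to its column width, and join the cells into a table row).
def fit_column_widths(rows):
    # widths of the data columns; the leading filename column is skipped,
    # which is why the caller computes first_col from the length difference
    return [max(len(cell) for cell in col) for col in list(zip(*rows))[1:]]

def expand(data, col_widths):
    # pad cells to their column widths and join them as one table line
    cells = [cell.ljust(width) for cell, width in zip(data, col_widths)]
    return u'| ' + u' | '.join(cells) + u' |\n'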
Example #51
def get_csv(infile):
    sniff_range = 4096
    sniffer = csv.Sniffer()

    dialect = sniffer.sniff(infile.read(sniff_range), delimiters=DELIMITERS)
    infile.seek(0)

    # Sniff for header
    header = sniffer.has_header(infile.read(sniff_range))
    infile.seek(0)

    # get the csv reader
    reader = csv.reader(infile, dialect)
    firstrow = next(reader)

    colnames = []
    for i, h in enumerate(firstrow):
        if len(h) > 0 and header:
            colnames.append(h)
        else:
            colnames.append('COLUMN{}'.format(i + 1))

    # no header: rewind so the first row is read again as data
    if not header:
        infile.seek(0)

    return (reader, colnames)
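# A minimal usage sketch (not from the original example). DELIMITERS is a
# module-level constant in the original code; the value here is an assumption.
DELIMITERS = ',;\t|'

if __name__ == '__main__':
    with open('data.csv', 'r') as infile:
        reader, colnames = get_csv(infile)
        print(colnames)
        for record in reader:
            print(record)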
def translate_dialog_to_lists(dialog_filename):
    """
    Translates the dialog to a list of lists of utterances. In the first
    list each item holds subsequent utterances from the same user. The second level
    list holds the individual utterances.
    :param dialog_filename:
    :return:
    """

    dialog_file = open(dialog_filename, 'r')
    dialog_reader = unicodecsv.reader(dialog_file, delimiter='\t', quoting=csv.QUOTE_NONE)

    # go through the dialog
    first_turn = True
    dialog = []
    same_user_utterances = []
    #last_user = None
    dialog.append(same_user_utterances)

    for dialog_line in dialog_reader:

        if first_turn:
            last_user = dialog_line[1]
            first_turn = False

        if last_user != dialog_line[1]:
            # user has changed
            same_user_utterances = []
            dialog.append(same_user_utterances)

        same_user_utterances.append(dialog_line[3])

        last_user = dialog_line[1]

    dialog.append([dialog_end_symbol])

    dialog_file.close()
    return dialog
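# A minimal usage sketch (not from the original example). The input is expected
# to be a tab-separated file whose column at index 1 identifies the user and
# whose column at index 3 holds the utterance text; dialog_end_symbol is
# assumed to be defined in the original module.
if __name__ == '__main__':
    turns = translate_dialog_to_lists('dialog_001.tsv')
    for same_user_turn in turns:
        print(same_user_turn)  # consecutive utterances from one user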
Example #53
    def _build_tcm(self, sensor_log, sensor_id_pos):
        """
        Build the topological compatibility matrix associated with the given sensor log.
        
        :type sensor_log: file
        :param sensor_log: the tab-separated file containing the sensor log.
        :param sensor_id_pos: the position of the sensor id in the log entry.
        """
        sensor_log_reader = csv.reader(sensor_log, delimiter=LOG_ENTRY_DELIMITER)

        s0 = next(sensor_log_reader, None)  # consider a sliding window of two events per step
        s1 = next(sensor_log_reader, None)
        if s0 is not None:
            self.sensors_occurrences[s0[sensor_id_pos]] = 1
        while s0 is not None and s1 is not None:
            s0_id = s0[sensor_id_pos]
            s1_id = s1[sensor_id_pos]

            # increase sensor occurrences
            try:
                self.sensors_occurrences[s1_id] += 1
            except KeyError:
                self.sensors_occurrences[s1_id] = 1

            # add sensors ids to matrix and update succession counter
            self._add_sensor(s0_id)
            self._add_sensor(s1_id)
            self.prob_matrix[s0_id][s1_id] += 1

            # prepare next step (slide the window by one position)
            s0 = s1
            s1 = next(sensor_log_reader, None)

        for s_row in self.prob_matrix:
            for s_col in self.prob_matrix[s_row]:
                if self.prob_matrix[s_row][s_col] != 0:
                    # normalize cell value with respect to predecessor total occurrences
                    self.prob_matrix[s_row][s_col] /= self.sensors_occurrences[s_row]
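# A standalone sketch (not the original class) of the same idea: count direct
# successions over a sliding window of two events, then normalise each row by
# the total occurrences of the predecessor sensor.
from collections import Counter, defaultdict

def build_succession_matrix(sensor_ids):
    occurrences = Counter(sensor_ids)
    matrix = defaultdict(lambda: defaultdict(float))
    for prev_id, next_id in zip(sensor_ids, sensor_ids[1:]):
        matrix[prev_id][next_id] += 1
    for prev_id in matrix:
        for next_id in matrix[prev_id]:
            matrix[prev_id][next_id] /= occurrences[prev_id]
    return matrix

# build_succession_matrix(['A', 'B', 'A', 'B', 'C'])['A']['B'] == 1.0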
Example #54
def test_writerow():
    import os

    row = {}
    row['schema_name'] = u'test_schema_name'
    row['schema_title'] = u'test_schema_title'
    row['publish_date'] = u'2015-01-01'
    row['variable'] = u'test_var'
    row['title'] = u'test_title'
    row['description'] = u'test_desc'
    row['is_required'] = False
    row['is_system'] = False
    row['is_collection'] = False
    row['is_private'] = False
    row['field_type'] = False
    row['choices_string'] = u'0=test1;1=test2'
    row['order'] = 7

    output_csv = open('output.csv', 'w')
    writer = csv.writer(output_csv, encoding='utf-8')

    iform_json.writerow(writer, row)
    output_csv.close()

    output_csv = open('output.csv', 'rb')
    reader = csv.reader(output_csv, encoding='utf-8')

    test_row = reader.next()

    assert test_row[0] == u'test_schema_name'
    assert test_row[3] == u'test_var'
    assert test_row[7] == u'False'
    assert test_row[12] == u'7'

    output_csv.close()
    os.remove('output.csv')
Example #55
def import_food_items_constraints_csv_as_dict(in_filepath=None, schema=None):
    """
	Import high-level goals from CSV and transform to dict objects

	Args:
		in_filepath: Input filepath containing the high-level goals description
		schema: Metadata description the input CSV schema

	Returns:
		List of dict objects describing the high level goals
	"""
    schema = schema or FOOD_ITEMS_CONSTRAINTS_CSV_SCHEMA
    in_filepath = in_filepath or 'data/food_items_constraints.csv'
    fp = open(in_filepath, "r")
    csv_reader = csv.reader(fp)
    header = next(
        csv_reader) if 'hasHeader' in schema and schema['hasHeader'] else None

    result = {}
    for data in csv_reader:
        result[data[schema['fields']['nbd_no']['index']]] = {
            'max_qty': float(data[schema['fields']['max_qty']['index']])
        }
    fp.close()
    return result
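# A sketch of the schema structure the function relies on, inferred from how
# it is indexed above; the real FOOD_ITEMS_CONSTRAINTS_CSV_SCHEMA may differ.
EXAMPLE_SCHEMA = {
    'hasHeader': True,
    'fields': {
        'nbd_no': {'index': 0},
        'max_qty': {'index': 1},
    },
}
# constraints = import_food_items_constraints_csv_as_dict(
#     'data/food_items_constraints.csv', EXAMPLE_SCHEMA)
# constraints['<nbd_no>'] -> {'max_qty': <float>}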
def load_csv_dataset(filename):
    """
    Loads a csv filename as a dataset
    :param str filename: name of the file
    :return List[DataSample]: a list of DataSample
    """
    dataset = []
    with open(os.path.join(DIR_GENERATED_DATA, filename), 'rb') as csv_file:
        reader = csv.reader(csv_file,
                            delimiter=';',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL,
                            errors='ignore')
        for row in reader:
            sample_id = int(row[0])
            text = row[1]
            gene = row[2]
            variation = row[3]
            try:
                real_class = int(row[4])
            except (ValueError, IndexError):
                real_class = None
            dataset.append(DataSample(sample_id, text, gene, variation, real_class))
    return dataset
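# DataSample and DIR_GENERATED_DATA come from elsewhere in the original
# project; the stand-ins below are assumptions consistent with how they are
# used above, not the original definitions.
from collections import namedtuple

DIR_GENERATED_DATA = 'generated_data'
DataSample = namedtuple('DataSample',
                        ['id', 'text', 'gene', 'variation', 'real_class'])

# samples = load_csv_dataset('train_set.csv')
# print(samples[0].gene, samples[0].real_class)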
Example #57
    def test_sponsored_grants_csv(self):
        """ Verify that sponsored grant fields can be fetched in csv format

    Setup:
      No filters selected
      All fields selected
      Format = browse

    Asserts:
      Basic success: able to iterate through response with reader
      Number of rows in results matches number of awards (gp + sponsored) in db

    """

        form = SponsoredAwardReportForm()
        post_dict = fill_report_form(form, select_fields=True, fmt='csv')
        post_dict['run-sponsored-award'] = ''

        response = self.client.post(self.url, post_dict)

        reader = unicodecsv.reader(response, encoding='utf8')
        row_count = sum(1 for row in reader)
        self.assertEqual(row_count - 2,
                         models.SponsoredProgramGrant.objects.count())
Example #58
def main():
    ini_config = ConfigParser()
    ini_config.read(args.configfile)
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    remote_apikey = ini_config.get('ckan', 'ckan.apikey')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url, apikey=remote_apikey)

    fi = open(args.csvfile, 'r')
    fo = open(args.outfile, 'w')

    csv_in = unicodecsv.reader(fi, encoding='utf-8')
    csv_out = unicodecsv.writer(fo, encoding='utf-8')
    csv_out.writerow(csv_in.next())
    for row in csv_in:
        # Look up the package in CKAN
        try:
            pkg = ckan_portal.action.package_show(id=row[0])
            # If the record does not exist, then a NotFound exception will be thrown
            row[2] = pkg['org_title_at_publication'][args.lang]
            row[1] = pkg['title_translated'][args.lang]
            csv_out.writerow(row)
        except NotFound:
            pass

    fi.close()
    fo.close()
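# A sketch of the ini file main() reads (the section and key names come from
# the code above; the values are placeholders, not real endpoints or keys):
#
#   [ckan]
#   ckan.url = https://open-data.example.org
#   ckan.apikey = 00000000-0000-0000-0000-000000000000
#
# The input CSV is expected to carry the package id in column 0; columns 1 and
# 2 are rewritten with the translated title and the publishing organisation.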
    def _find_segments_old(self, sensor_log):
        """
        Find segments in the given sensor log (old version).

        :type sensor_log: file
        :param sensor_log: the tab-separated file containing the sensor log.
        """
        sensor_log_reader = csv.reader(sensor_log,
                                       delimiter=LOG_ENTRY_DELIMITER)

        # consider a sliding window of two events per step
        s0 = next(sensor_log_reader, None)
        s1 = next(sensor_log_reader, None)
        segment = [list(s0)]
        while s0 is not None and s1 is not None:
            s0_id = s0[self.sensor_id_pos]
            s1_id = s1[self.sensor_id_pos]

            if self.top_compat_matrix.prob_matrix[s0_id][s1_id] >= self.compat_threshold:
                # the direct succession value is above the threshold
                segment.append(list(s1))  # continue the segment
            else:
                # the direct succession value is under the threshold:
                # only segments longer than a threshold are considered
                if len(segment) >= self.noise_threshold:
                    self.segments.append(list(segment))  # store a copy of the segment so far
                # start the new segment from the second item in the window
                segment = [list(s1)]

            # prepare next step (slide the window by one position)
            s0 = s1
            s1 = next(sensor_log_reader, None)
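# A standalone sketch (not the original method) of the same segmentation idea:
# slide a window of two events over the sequence and cut the segment whenever
# the direct-succession probability drops below the compatibility threshold,
# keeping only segments at least as long as the noise threshold. It assumes
# prob_matrix has an entry for every adjacent pair, like the matrix built above.
def find_segments(events, prob_matrix, compat_threshold, noise_threshold):
    segments = []
    segment = [events[0]] if events else []
    for prev_evt, next_evt in zip(events, events[1:]):
        if prob_matrix[prev_evt][next_evt] >= compat_threshold:
            segment.append(next_evt)            # continue the current segment
        else:
            if len(segment) >= noise_threshold:
                segments.append(list(segment))  # keep segments above the noise threshold
            segment = [next_evt]                # start a new segment
    return segments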
Example #60
def parse_prev_consumption(_filename, _path):
    meter_data = {}
    with open(path.join(_path, _filename), 'r') as f:
        data = csv.reader(f, encoding="utf-8")
        x = 0
        for row in data:
            if x == 0:
                x += 1  # skip first row
                continue
            meter_id = row[0]
            if meter_id:
                consumption = row[2]
                if not consumption:
                    consumption = 0
                consumption = float(consumption)
                """
                    meter_data = dict of dicts of arrays
                """
                try:  # find previously inserted value
                    meter_data[meter_id] += consumption
                except KeyError:  # add new meter data
                    meter_data[meter_id] = consumption
                    # when we create a HH we need a new username
    return meter_data
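# A minimal usage sketch (not from the original example): the CSV is expected
# to have a header row, the meter id in column 0 and the consumption value in
# column 2, as read above.
#
#   meter_id,period,consumption
#   M001,2015-01,12.5
#   M001,2015-02,10.0
#   M002,2015-01,3.2
#
# parse_prev_consumption('previous_consumption.csv', '/data') would then
# return {u'M001': 22.5, u'M002': 3.2}.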