def generate_results(res, classes_name): """ Converts the predictions to csv and creates the 'output.csv' file in the resources folder :param res: :param classes_name: :return: void """ toOutput = [] for i in range(len(res)): toOutput.append({'Id': i, 'Category': classes_name[res[i]]}) convert_to_csv(toOutput)
def start():
    print "importing worldbank data..."
    db.insert("source", {"name": "World Bank"})
    utils.convert_to_csv(
        os.path.join("data", "worldbank", "IND_Country_MetaData_en_EXCEL.xls"),
        os.path.join("data", "worldbank"))

    # import dataset
    with open(
            os.path.join(
                "data", "worldbank",
                "IND_Country_MetaData_en_EXCEL-sheet2.csv")) as datafile:
        reader = csv.reader(datafile.read().splitlines())
        for i, row in enumerate(reader):
            if i == 0:
                continue
            row = [unicode(c, "utf-8", errors="ignore") for c in row]
            db.insert_dataset({
                "name": row[1][:150],
                "title": row[1],
                "description": row[2],
                "source_info": row[3],
                "source": "World Bank"
            })

    # import data
    with open(
            os.path.join(
                "data", "worldbank",
                "IND_Country_MetaData_en_EXCEL-sheet1.csv")) as datafile:
        reader = csv.reader(datafile.read().splitlines())
        db.insert("region", {"name": "India"})
        for i, row in enumerate(reader):
            if i == 0:
                headers = row
                for year in row[2:]:
                    db.insert("period", {"name": year})
            else:
                for ci, value in enumerate(row):
                    if ci > 1 and utils.flt(value):
                        db.insert(
                            "data", {
                                "dataset": row[0],
                                "period": headers[ci],
                                "value": value,
                                "region": "India",
                            })
            if i % 100 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
def download():
    properties.load_properties()
    for i in range(39, 61):
        print "for page %s" % i
        response = requests.get(sources[0] + "?page=%s" % i, verify=False)
        page_properties = get_url_title_and_description_from_html(response.text)
        for filename in page_properties:
            filepath = os.path.join("data", "data.gov.in", filename)
            if not os.path.exists(filepath):
                try:
                    url = urllib2.urlopen(page_properties[filename]["url"])
                    size = url.headers["Content-Length"]
                    if int(size) < int(max_file_size):
                        with open(filepath, "wb") as datafile:
                            r = requests.get(page_properties[filename]["url"])
                            for chunk in r.iter_content(1024):
                                datafile.write(chunk)
                    else:
                        print "[ignored] [too big] %s (%s)" % (filename, size)
                except urllib2.HTTPError, e:
                    print e
            if os.path.exists(filepath):
                files = convert_to_csv(filepath, os.path.join("data", "csv"))
                for fpath in files:
                    prepend_property_headers(fpath, page_properties[filename])
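# In the scraper snippets (and in start()), convert_to_csv instead takes an
# Excel file path plus an output directory and returns the list of CSV paths
# it produced. A sketch under those assumptions, using xlrd; the
# one-CSV-per-sheet "<basename>-sheetN.csv" naming is inferred from the
# filenames start() reads back, not confirmed by the source:
import csv
import os

import xlrd  # pip install xlrd; reads legacy .xls workbooks

def convert_to_csv(xls_path, out_dir):
    base = os.path.splitext(os.path.basename(xls_path))[0]
    book = xlrd.open_workbook(xls_path)
    out_paths = []
    for n, sheet in enumerate(book.sheets(), start=1):
        out_path = os.path.join(out_dir, "%s-sheet%d.csv" % (base, n))
        with open(out_path, "w", newline="") as f:
            writer = csv.writer(f)
            for r in range(sheet.nrows):
                writer.writerow(sheet.row_values(r))
        out_paths.append(out_path)
    return out_paths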
def testfile(self, test_records):
    """Returns a named temporary file with test data saved."""
    tmpfile = NamedTemporaryFile()
    assert os.path.exists(tmpfile.name)
    with open(tmpfile.name, 'w') as tmpfilewriter:
        data_csv = convert_to_csv(test_records)
        tmpfilewriter.write(data_csv)
    return tmpfile
def save(self, records=None):
    """Saves records to the slice file."""
    records_list = list()
    if records is not None:
        records_list = records
    else:
        if not self._records:
            return
        for channeled_records in self._records.values():
            records_list.extend(channeled_records)
        records_list = sorted(records_list, key=lambda record: record[0])
    data_csv = convert_to_csv(records_list)
    blob = self._bucket.blob(self._filename)
    blob.upload_from_string(data_csv)
def write_to_tmpfile(self, records):
    """Writes records to a tmp file.

    Args:
      records: A list of records.

    Returns:
      An IO object.
    """
    tmpfile = NamedTemporaryFile()
    with open(tmpfile.name, 'w') as filewriter:
        data_csv = convert_to_csv(records)
        filewriter.write(data_csv)
    assert os.path.exists(tmpfile.name)
    return tmpfile
def write_to_tmpfile(self, records_to_be_written):
    """Writes records to a temporary file.

    Args:
      records_to_be_written: A list of records.

    Returns:
      A fileIO object for that temp file.
    """
    tmpfile = NamedTemporaryFile()
    assert os.path.exists(tmpfile.name)
    with open(tmpfile.name, 'w') as tmpfilewriter:
        data_csv = convert_to_csv(records_to_be_written)
        tmpfilewriter.write(data_csv)
    return tmpfile
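# The temp-file helpers above all depend on the same tempfile detail:
# NamedTemporaryFile() defaults to delete=True, so the file disappears as
# soon as the returned object is closed or garbage-collected. Returning
# `tmpfile` rather than `tmpfile.name` is what keeps the file alive for the
# caller. A usage sketch (`writer` and `records` are hypothetical):
tmpfile = writer.write_to_tmpfile(records)  # hold on to this reference
with open(tmpfile.name) as f:               # re-opening by name works on POSIX
    print(f.read())
tmpfile.close()                             # delete=True: the file is removed here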
def download():
    sourcepath = os.path.join("app", "downloads", "data.gov.in")
    if not os.path.exists(sourcepath):
        os.makedirs(sourcepath)
    for i in range(165, 415):
        print "for page %s" % i
        response = requests.get(sources[0] + "?page=%s" % i, verify=False)
        page_properties = get_url_title_and_description_from_html(
            response.text)
        for filename, properties in page_properties.iteritems():
            filepath = os.path.join(sourcepath, filename)
            print "downloading " + filename
            if not os.path.exists(filepath):
                try:
                    url = urllib2.urlopen(properties["url"].encode("utf-8"))
                    size = url.headers["Content-Length"]
                    if int(size) < int(max_file_size_in_kb * 1024):
                        r = requests.get(properties["url"])
                        with open(filepath, "wb") as datafile:
                            for chunk in r.iter_content(1024):
                                datafile.write(chunk)
                    else:
                        print "[ignored] [too big] %s (%s)" % (filename, size)
                except urllib2.HTTPError, e:
                    print e
            if os.path.exists(filepath):
                if filepath.split(".")[-1] == "xls":
                    try:
                        files = convert_to_csv(filepath, sourcepath)
                        # remove original xls file (not needed)
                        os.remove(filepath)
                        # keep an empty file as a marker that this
                        # file has been downloaded
                        os.system("touch %s" % filepath)
                    except Exception, e:
                        files = []
                        print e
                else:
                    files = [filepath]
                for fpath in files:
                    prepend_property_headers(fpath, properties)
def upload(filename, records):
    """Uploads records to a bucket under the given file name.

    Args:
      filename: A string for the name of the file.
      records: A list of records.

    Returns:
      GCS bucket object.
    """
    client = storage.Client()
    bucket = client.bucket(TEST_BUCKET)
    if records is not None:
        blob = bucket.blob(filename)
        blob.upload_from_string(convert_to_csv(records))
    return bucket
def score(self, predictions, result):
    """Returns the fraction of predictions whose category matches result."""
    count = 0
    for i in range(len(predictions)):
        if predictions[i]['Category'] == result[i]:
            count += 1
    return count / len(predictions)

def define_alpha(self, validation_comments, validation_result):
    """Helper function to find a good value for hyper param alpha."""
    alpha = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
    result = np.zeros(len(alpha))
    for i in range(len(alpha)):
        print('Alpha ', i + 1, '/', len(alpha), ' : ', alpha[i])
        predict = self.predict(validation_comments, alpha[i])
        result[i] = self.score(predict, validation_result)
        print(result[i])
    print(result)
    print(alpha[np.argmax(result)])
    return alpha[np.argmax(result)]

if __name__ == "__main__":
    train_data = read_train_data()
    test_data = read_test_data()
    comment = train_data[0]
    result = train_data[1]
    bayes_classifier = BayesClassifier()
    alpha_star = 0.01
    bayes_classifier.train(comment, result)
    predictions = bayes_classifier.predict(test_data, alpha_star)
    convert_to_csv(predictions)
def test_convert_to_csv(self, test_records, test_csv_records):
    """Tests on convert_to_csv."""
    for length in range(len(test_records)):
        assert convert_to_csv(test_records[:length + 1]) == test_csv_records[length]
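# The test above, together with the tmpfile and GCS helpers, implies the
# record-list flavor of convert_to_csv: it takes a list of records and
# returns the CSV text as a string. A minimal sketch, assuming each record
# is a flat iterable of values (e.g. (timestamp, channel, value)):
import csv
import io

def convert_to_csv(records):
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerows(records)
    return buf.getvalue()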