import gzip
from csv import DictReader
from collections import defaultdict


def guesstypes(filename, has_header, headers, dialect, missing_values):
    """Sample up to 1000 rows and guess whether each column is numeric or categoric."""
    print 'guessing types..'
    if filename.endswith('.gz'):
        csvfile = gzip.open(filename, 'rb')
    else:
        csvfile = open(filename, 'rb')
    reader = DictReader(csvfile, fieldnames=headers, dialect=dialect)
    if has_header:
        reader.next()
    types = defaultdict(int)  # 0 = numeric, 1 = categoric
    # values = defaultdict(set)
    line = 0
    for row in reader:
        for k, v in row.iteritems():
            if (v is not None) and (v.lower() not in missing_values):
                if not is_number(v):
                    types[k] = 1
                # values[k].add(v)
        line += 1
        if line > 1000:
            break
    csvfile.close()
    return types
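# guesstypes() relies on an is_number() helper that is not shown above. A minimal
# sketch of what such a helper might look like (an assumption, not the original code):
def is_number(s):
    """Return True if the string parses as a float (covers ints and decimals)."""
    try:
        float(s)
        return True
    except ValueError:
        return False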
def skipHeaders(reader):
    """Skip over the header lines in the csv documents so we can access the data."""
    # skip 'comment' section at header of DATA_FILE
    currentLine = reader.next()
    while (currentLine[0][0] == HEADER_TAG):  # HEADER_TAG is the comment-marker character, defined elsewhere
        currentLine = reader.next()
def localization_table(reader):
    reader.next()  # pop the header
    table = {}
    for row in reader:
        key = L8nKey(*row[0:2])
        row = L8nRow(*row[4:7])
        table[key] = row
    return table
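# L8nKey and L8nRow are not defined in this snippet; they appear to be namedtuples
# built from columns 0-1 and 4-6 of each CSV row. A minimal sketch with assumed,
# hypothetical field names (only the field counts follow from the code above):
from collections import namedtuple

L8nKey = namedtuple('L8nKey', ['section', 'identifier'])      # columns 0-1 (assumed names)
L8nRow = namedtuple('L8nRow', ['text', 'comment', 'status'])  # columns 4-6 (assumed names)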
import gzip
from csv import DictReader


def csv2json(filename, has_header, headers, dialect, missing_values, types):
    if filename.endswith('.gz'):
        csvfile = gzip.open(filename, 'rb')
    else:
        csvfile = open(filename, 'rb')
    reader = DictReader(csvfile, fieldnames=headers, dialect=dialect)
    if has_header:
        reader.next()
    for row in reader:
        # try to convert all numbers
        # filter out missing cells
        event = dict([(k, convert(k, v, types))
                      for k, v in row.iteritems()
                      if (v is not None) and (v.lower() not in missing_values)])
        yield event
    csvfile.close()
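# csv2json() depends on a convert() helper that is not shown here. A minimal sketch,
# assuming it casts values in columns guessed as numeric (types[k] == 0 per guesstypes)
# and leaves categoric columns as strings:
def convert(k, v, types):
    if types[k] == 0:  # column guessed as numeric
        try:
            return float(v)
        except ValueError:
            return v
    return v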
def schema(self, vid):
    """Render documentation for a single bundle."""
    from csv import reader
    from StringIO import StringIO
    import json

    template = self.env.get_template('bundle/schema.html')

    b_data = self.doc_cache.bundle(vid)
    b = self.library.bundle(vid)

    reader = reader(StringIO(b.schema.as_csv()))

    del b_data['partitions']
    del b_data['tables']

    schema = dict(header=reader.next(), rows=[x for x in reader])

    return self.render(template, b=b_data, schema=schema, **self.cc())
from csv import reader
import itertools
import nltk
import numpy as np

# vocab size
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print "Reading CSV file..."
with open('data/reddit-comments-2015-08.csv', 'rb') as f:
    reader = reader(f, skipinitialspace=True)
    reader.next()  # skip the csv header
    # Split full comments into sentences
    sentences = itertools.chain(
        *[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token)
                 for x in sentences]
print "Parsed %d sentences." % (len(sentences))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
from csv import reader
from pymongo import MongoClient

filename = "C:\\Users\\Greg\\Desktop\\Flask Tutorial\\twitterApp\\twitterSampleData.csv"

reader = reader(open(filename, 'r'))
data = []
titleRow = reader.next()
for row in reader:
    dataDict = {}
    for key, value in zip(titleRow, row):
        dataDict[key] = value
    data.append(dataDict)

client = MongoClient()
db = client['twitterData']
posts = db.posts
posts.insert(data)
client.close()
# imports
from csv import reader
import os

# user params
relative_path = '../Data/text'
limit = 10

# initialize path
dir = os.path.dirname(__file__)
abs_path = os.path.join(dir, relative_path)

# open file
print '\n------------------'
print 'Open File...'
file = open(abs_path)
reader = reader(file, dialect="excel-tab")
reader.next()  # skip the first line

# loop over lines
i = 0
for line in reader:
    # show line
    print line
    # early exit
    i += 1
    if i > limit:
        break
import csv


def header(filename, delimiter=","):
    """Return the first (header) row of a delimited file."""
    datafile = open(filename, "rU")
    reader = csv.reader(datafile, delimiter=delimiter)
    header = reader.next()
    datafile.close()
    return header
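# Example usage (assuming hypothetical files 'data.csv' and 'data.tsv' with header rows):
# cols = header('data.csv')
# tsv_cols = header('data.tsv', delimiter='\t')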
import csv
from itertools import izip
import pandas as pd

# NOTE: yarn_stats, br, writeFewYarnComments and writeManyYarnComments are
# defined elsewhere and are not shown in this fragment.

# file is in unicode and must be decoded when read back into Python for further processing
with open('comments.csv', 'ab') as f:  # NEEDS TO BE DECODED FOR UTF-8 WHEN READ BACK IN
    writer = csv.writer(f)
    for yarn_index, permalink, num_comments in izip(yarn_stats.index,
                                                    yarn_stats["permalink"],
                                                    yarn_stats["num_comments"]):
        if num_comments < 100:
            writeFewYarnComments(yarn_index, permalink, br)
        else:
            writeManyYarnComments(yarn_index, permalink, num_comments, br)

# to get comments and yarn IDs in long format
from csv import reader

new_comments_index = []
new_comments = []
with open('comments.csv', 'rb') as f_in:  # read back in the file we just created
    reader = reader(f_in)
    reader.next()  # skip header row in csv
    for row in reader:
        this_index = row.pop(0)  # get yarn index
        row_mod = [s.decode('utf-8') for s in row if s != '']
        this_index = [this_index for s in row_mod]
        new_comments.extend(row_mod)
        new_comments_index.extend(this_index)
# print new_comments

comms_with_index = pd.DataFrame({'comment': new_comments}, index=new_comments_index)
comms_with_index.to_csv('comments_with_yarn_ids.csv', encoding='utf-8', index_label='yarn_id')

# to get comments and yarn IDs with each set of comments collapsed (to one document per yarn)
from csv import reader

new_comments_index = []
new_comments_collapsed = []