Example #1
import gzip
from collections import defaultdict
from csv import DictReader


def guesstypes(filename, has_header, headers, dialect, missing_values):
    print 'guessing types..'
    if filename.endswith('.gz'):
        csvfile = gzip.open(filename, 'rb')
    else:
        csvfile = open(filename, 'rb')

    reader = DictReader(csvfile, fieldnames=headers, dialect=dialect)
    if has_header:
        reader.next()
    types = defaultdict(int)  # 0 = numeric, 1 = categoric
    line = 0
    for row in reader:
        for k, v in row.iteritems():
            if (v is not None) and (v.lower() not in missing_values):
                # is_number() is a helper defined elsewhere in the source project
                if not is_number(v):
                    types[k] = 1
        line += 1
        if line > 1000:  # sample at most 1000 rows
            break
    csvfile.close()
    return types
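The function relies on an is_number helper that is not part of this excerpt. A minimal sketch, assuming "numeric" simply means the string parses as a float:

def is_number(value):
    # Hypothetical helper: True if the string can be parsed as a float.
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False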
Example #2
def skipHeaders(reader):
    """Skip over the header lines in the csv documents so we can access
    the data."""

    # skip the 'comment' section at the head of DATA_FILE;
    # HEADER_TAG is a constant defined elsewhere in the source project
    currentLine = reader.next()
    while (currentLine[0][0] == HEADER_TAG):
        currentLine = reader.next()
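A minimal usage sketch, assuming HEADER_TAG marks comment lines with a leading '#' and using a hypothetical file name. Note that the loop also consumes the first non-comment line it reads (typically a column-name row):

import csv

HEADER_TAG = '#'  # assumed comment marker, not from the source

with open('data.csv', 'rb') as f:  # hypothetical file name
    reader = csv.reader(f)
    skipHeaders(reader)
    for row in reader:
        print row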
Example #3
def localization_table(reader):
    reader.next() # pop the header
    table = {}
    for row in reader:
        key = L8nKey(*row[0:2])
        row = L8nRow(*row[4:7])
        table[key] = row
    return table
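L8nKey and L8nRow are defined elsewhere in the source project. One plausible sketch, assuming they are plain namedtuples whose arity matches the slices row[0:2] and row[4:7] (the field names below are invented for illustration):

from collections import namedtuple

# Hypothetical definitions; the field names are assumptions, not from the source.
L8nKey = namedtuple('L8nKey', ['section', 'identifier'])
L8nRow = namedtuple('L8nRow', ['text_a', 'text_b', 'text_c'])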
Example #4
import gzip
from csv import DictReader


def csv2json(filename, has_header, headers, dialect, missing_values, types):
    if filename.endswith('.gz'):
        csvfile = gzip.open(filename, 'rb')
    else:
        csvfile = open(filename, 'rb')

    reader = DictReader(csvfile, fieldnames=headers, dialect=dialect)
    if has_header:
        reader.next()
    for row in reader:
        # try to convert all numbers and filter out missing cells;
        # convert() is a helper defined elsewhere in the source project
        event = dict([(k, convert(k, v, types)) for k, v in row.iteritems()
                      if (v is not None) and (v.lower() not in missing_values)])
        yield event
    csvfile.close()
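The generator above depends on a convert helper that is not shown. A minimal sketch, assuming types maps a column name to 0 (numeric) or 1 (categoric) as produced by guesstypes in Example #1:

def convert(key, value, types):
    # Hypothetical helper: cast values in numeric columns to float,
    # leave categoric columns as strings.
    if types.get(key, 0) == 0:  # 0 = numeric
        try:
            return float(value)
        except ValueError:
            pass
    return value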
Example #7
File: render.py  Project: Gudinya/ambry
    def schema(self, vid):
        """Render documentation for a single bundle."""
        from csv import reader
        from StringIO import StringIO
        import json

        template = self.env.get_template('bundle/schema.html')

        b_data = self.doc_cache.bundle(vid)

        b = self.library.bundle(vid)

        reader = reader(StringIO(b.schema.as_csv()))

        del b_data['partitions']
        del b_data['tables']

        schema = dict(header=reader.next(), rows=[x for x in reader])

        return self.render(template, b=b_data, schema=schema, **self.cc())
Example #8
from csv import reader
import itertools

import nltk
import numpy as np

# vocab size
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print "Reading CSV file..."
with open('data/reddit-comments-2015-08.csv', 'rb') as f:
    reader = reader(f, skipinitialspace=True)
    reader.next()  # skip the header row
    # Split full comments into sentences
    sentences = itertools.chain(
        *[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = [
        "%s %s %s" % (sentence_start_token, x, sentence_end_token)
        for x in sentences
    ]
print "Parsed %d sentences." % (len(sentences))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
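vocabulary_size and unknown_token are defined but unused in this excerpt. A plausible continuation (an assumption, not shown above) keeps only the most frequent words and maps the rest to unknown_token:

# Hypothetical continuation: build the vocabulary from word_freq.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [w for w, _ in vocab] + [unknown_token]
word_to_index = dict((w, i) for i, w in enumerate(index_to_word))

# Replace words not in the vocabulary with unknown_token.
tokenized_sentences = [
    [w if w in word_to_index else unknown_token for w in sent]
    for sent in tokenized_sentences
]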
Example #9

from csv import reader
from pymongo import MongoClient

filename = "C:\\Users\\Greg\\Desktop\\Flask Tutorial\\twitterApp\\twitterSampleData.csv"
reader = reader(open(filename, 'r'))

data = []
titleRow = reader.next()  # the first row holds the column names

# build one dict per CSV row, keyed by the column names
for row in reader:
    dataDict = {}
    for key, value in zip(titleRow, row):
        dataDict[key] = value
    data.append(dataDict)

# insert the rows into MongoDB
client = MongoClient()
db = client['twitterData']

posts = db.posts
posts.insert(data)

client.close()
Example #10

# imports
from csv import reader
import os

# user params
relative_path = '../Data/text'
limit = 10

# initialize path
dir = os.path.dirname(__file__)
abs_path = os.path.join(dir, relative_path)

# open file
print '\n------------------'
print 'Open File...'
file = open(abs_path)
reader = reader(file, dialect="excel-tab")
reader.next()

# loop over lines
i = 0
for line in reader:

    # show line
    print line

    # early exit after `limit` lines
    i += 1
    if i > limit:
        break
Example #11
import csv
from itertools import izip

import pandas as pd

def header(filename, delimiter=","):
    datafile = open(filename, "rU")
    reader = csv.reader(datafile, delimiter=delimiter)
    header = reader.next()
    datafile.close()
    return header
# the file is written as UTF-8 and must be decoded when it is read back into
# Python for further processing; yarn_stats, br and the writeFewYarnComments /
# writeManyYarnComments helpers are defined elsewhere in the source project
with open('comments.csv', 'ab') as f:
    writer = csv.writer(f)
    for yarn_index, permalink, num_comments in izip(yarn_stats.index,
                                                    yarn_stats["permalink"],
                                                    yarn_stats["num_comments"]):
        if num_comments < 100:
            writeFewYarnComments(yarn_index, permalink, br)
        else:
            writeManyYarnComments(yarn_index, permalink, num_comments, br)

# to get comments and yarn IDs in long format
from csv import reader
new_comments_index = []
new_comments = []
with open('comments.csv', 'rb') as f_in:  # read back in the file we just created
    reader = reader(f_in)
    reader.next()  # skip header row in csv
    for row in reader:
        this_index = row.pop(0)  # get yarn index
        row_mod = [s.decode('utf-8') for s in row if s != '']
        this_index = [this_index for s in row_mod]  # repeat the yarn index once per comment
        new_comments.extend(row_mod)
        new_comments_index.extend(this_index)
        # print new_comments

comms_with_index = pd.DataFrame({'comment' : new_comments}, index=new_comments_index)
comms_with_index.to_csv('comments_with_yarn_ids.csv', encoding='utf-8', index_label='yarn_id')              

# to get comments and yarn IDs with each set of comments collapsed (to one document per yarn)
from csv import reader
new_comments_index = []
new_comments_collapsed = []