Example #1
	def summarise(self):
		"""scan the combined CSV file and total up the number of error, warning etc.
		
		also record the total number of "events" per component."""
		
		self.events = {}
		self.components = {}
	
		if self.limit > 0:
			csv.field_size_limit(self.limit)
			
		reader = csv.reader(open(self.csv, "rb"))
		for row in reader:
		
			event = row[0]
			if event == "info" and row[2] == "version":
				self.raptor_version = row[3]
				continue
			
			if event in self.events:
				self.events[event] += 1
			else:
				self.events[event] = 1
			
			bldinf = row[1]
			if bldinf in self.components:
				self.components[bldinf] += 1
			else:
				self.components[bldinf] = 1
			
		if self.verbose:
			for (event, count) in self.events.items():
				print("{0} : {1}".format(event, count))
			print("{0} components".format(len(self.components)))
Example #2
def parseCSVfile(filename):
	global no_of_printed_st
	global no_of_empty_st

	csv.field_size_limit(10000000) # because we deal with huge csv file
	with open(filename, 'rb') as data:
		reader = csv.reader(data)
		try:
			try:
				# create a new file or **overwrite an existing file**.
				# replace the .csv extension (str.rstrip strips characters, not a suffix)
				new_file = filename[:-4] + ".txt" if filename.endswith('.csv') else filename + ".txt"
				print new_file
				f = open(new_file, "w")
			except IOError:
				pass
			try:
				for row in reader:
					st_str = "".join(row) # stack trace as string
					st_list = st_str.split("\n") # in list 
					st_dict = keepSTIntoDict(st_list) # in dictionary
					r_st_list = reverseSTDictValues(st_dict) # reversed values for each key
					# print the stack trace only if the reversed list is not empty
					if isr_st_listEmpty(r_st_list) == False:
						printValidST(f, r_st_list)
						checkUnknownExceptionExistence(r_st_list)
						no_of_printed_st = no_of_printed_st + 1 # increase the number of printed stack traces
					else:
						no_of_empty_st = no_of_empty_st + 1 # increase the number of empty stack traces
			finally:
				f.close()
		except csv.Error as e:
			sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
Example #3
    def __init__(self, f, encoding='utf-8', maxfieldsize=None, **kwargs):
        f = UTF8Recoder(f, encoding)

        self.reader = csv.reader(f, **kwargs)

        if maxfieldsize:
            csv.field_size_limit(maxfieldsize)
Example #4
    def readVCF(self, data):
        print "interpreting as a VCF file"
        csv.field_size_limit(1000000000)
        # load a tab delimited file of SNPs -> genotypes (like 23andme generates)
        reader = csv.reader(data, delimiter='\t')
        print "length of file: ", reader.line_num
        i = 0
        for row in reader:
            # print i
            i = i + 1
            if '#' in row[0]:
                print row
                continue
            if (len(row) == 10):
                rsid = row[2]
                ref = row[3]
                alt = row[4]
                data = row[9]
                allele1 = data[0]
                allele2 = data[2]
                genotype = ""
                if allele1[0] == '0': genotype = genotype + ref
                elif allele1[0] == '1': genotype = genotype + alt
                else:
                    print allele1, " issue with ", rsid
                    print row

                if allele2[0] == '0': genotype = genotype + ref
                elif allele2[0] == '1': genotype = genotype + alt
                else:
                    print allele2, " issue with ", rsid
                    print row

                self.SNPs[rsid] = genotype  # set SNP -> genotype value
        print "done reading VCF file"
Example #5
def writetest(Xpreds, fil='testresultsNN.csv') :
	import csv
	import numpy as np
	csv.field_size_limit(1000000000)
	outwriter = csv.writer(open(fil,'w'),delimiter=",")
	rows = np.arange(0,len(Xpreds))
	for row in rows :
		outwriter.writerow([row+1,Xpreds[row]])
Example #6
def readCSV(resultfile):
	csv.field_size_limit(sys.maxint)
	csvReader = csv.reader(open(resultfile,"rb"),delimiter=',')
	results = []
	for row in csvReader:
		results.append(row)
	return (results[0],results[1:])
Example #7
  def Load(self, kind, data):
    """Parses CSV data, uses a Loader to convert to entities, and stores them.

    On error, fails fast. Returns a "bad request" HTTP response code and
    includes the traceback in the output.

    Args:
      kind: a string containing the entity kind that this loader handles
      data: a string containing the CSV data to load

    Returns:
      tuple (response code, output) where:
        response code: integer HTTP response code to return
        output: string containing the HTTP response body
    """
    Validate(kind, basestring)
    Validate(data, basestring)
    output = []

    try:
      loader = Loader.RegisteredLoaders()[kind]
    except KeyError:
      output.append('Error: no Loader defined for kind %s.' % kind)
      return (httplib.BAD_REQUEST, ''.join(output))

    buffer = StringIO.StringIO(data)
    reader = csv.reader(buffer, skipinitialspace=True)

    try:
      csv.field_size_limit(800000)
    except AttributeError:
      pass

    return self.LoadEntities(self.IterRows(reader), loader)
Example #8
def _get_city_db():
    csv.field_size_limit(sys.maxsize)
    cities_file = os.path.join(os.path.dirname(__file__), 'cities.txt')
    with open(cities_file, 'rt') as f:
        r = csv.reader(f, delimiter='\t')
        city_db = list(r)
        return city_db
Example #9
def loadTraces(fileName):
  """
  Load network traces from CSV
  :param fileName: (str) name of the file
  :return traces: (dict) network traces. E.g: activeCells, sensorValues, etc.
  """

  csv.field_size_limit(sys.maxsize)

  with open(fileName, 'rb') as fr:
    reader = csv.reader(fr)
    headers = reader.next()

    traces = dict()
    for field in headers:
      traces[field] = []

    for row in reader:
      for i in range(len(row)):
        if len(row[i]) == 0:
          data = []
        else:
          if headers[i] in ['tmPredictedActiveCells',
                            'tpActiveCells',
                            'tmActiveCells']:
            if row[i] == '[]':
              data = []
            else:
              data = map(int, row[i][1:-1].split(','))
          else:
            data = float(row[i])
        traces[headers[i]].append(data)

  return traces
Example #10
def writetest(idx,Xpreds, fil='NN.512.256.64.csv') :
	import csv
	import numpy as np
	csv.field_size_limit(1000000000)
	outwriter = csv.writer(open(fil,'w'),delimiter=",")
	rows = np.arange(0,len(Xpreds))
	for row in rows :
		outwriter.writerow([int(idx[row]),Xpreds[row]])
Example #11
def main(train_file, test_file):
  #print "loading data.."
  csv.field_size_limit(1310720)
  trainreader = csv.reader (open( '/home/kiran/kdd/train.csv' ))
  projectid, traindata_old = zip (*trainreader)  

  testreader = csv.reader (open ('/home/kiran/kdd/test.csv'))
  projectid, testdata_old = zip (*testreader)


  # remove stopwords
  traindata = []
  testdata = []
  for observation in traindata_old:
      traindata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))
  for observation in testdata_old:
      testdata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))

  tfv = CountVectorizer (binary=1,ngram_range=(1, 1))
  X_all = traindata + testdata
  lentrain = len(traindata)
  tfv.fit(X_all)
  X_all = tfv.transform(X_all)
  X = X_all[:lentrain]
  X_test = X_all[lentrain:]
  scipy.io.mmwrite ('x_train_bin_1gram.mtx', X, field = 'real')
  scipy.io.mmwrite ('x_test_bin_1gram.mtx', X_test, field = 'real')
  myCols = tfv.get_feature_names ()
  myCols = DataFrame (myCols)
  myCols.to_csv ('bin_1gram.csv', index=False)
Example #12
def dic_gen(file):
	
	dict_info = {}
	
	csv.field_size_limit(1000000000)
	reader = csv.reader(open(file), delimiter = ' ')

	for row in reader:
		
		intron = row[0]
		coverage = int(row[1])
		chr = row[2]
		strand = row[3]
		istart = row[4]
		iend = row[5]
		ilength = int(row[6])
		dn = row[7]
		dn_type = row[8]
		dn_type_score = row[9]
		reads = row[10]

		total_introns.add((intron, chr, strand, istart, iend, ilength, dn, dn_type, dn_type_score))
		dict_info[intron] =  [coverage, ilength]       # For the cDNA_EST entries, ilength is the EST coverage
	
	return dict_info
Example #13
def main(introns_final_table):
	csv.field_size_limit(1000000000)

	reader1 = csv.reader(open(introns_final_table), delimiter = ' ')
		
	dn_type = defaultdict(int)
	Total = 0
	dns = []
		
	for row in reader1:
		intron = row[0]
		dn = row[7]
		dn_type[dn] += 1
		Total += 1
		dns.append(dn)
		
	print "TOTAL =", Total 
	print "Dinucleotide_TYPE", "Number", "%"
		
	dn_frec = dn_type.items()
	dn_frec.sort(key=lambda x: x[1])
		
	for i in reversed(dn_frec):
		dn = i[0]
		frec = i[1]
			
		print dn, frec, percent(frec, Total)
Example #14
    def parsecsv(self, fname, upperbound = None):
        """Parse CSV file containing talk data.  This should be replaceable
           with something that talks to the backing database containing the
           actual data.
        """
        csv.field_size_limit(sys.maxsize)

        with open(fname, 'r') as csvfile:
            talkreader = csv.reader(csvfile)

            rownum = 0
            header = []
            results = []
            
            for row in talkreader:
                result = {}
                if rownum == 0:
                    header = row
                elif upperbound is not None and rownum > upperbound:
                    break
                else:
                    for i,column in enumerate(header):
                        result[column] = row[i]
                        
                    result['text'] = self.parsebody(result['body'])

                    results.append(result)
                rownum += 1
            return results
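The manual header-to-dict mapping in parsecsv() is essentially what csv.DictReader provides; a minimal sketch of the same parse as a standalone function (parsebody is passed in as a callable here, since the original is a method on the class):

import csv
import sys

def parse_talks(fname, parsebody, upperbound=None):
    """Sketch: same row dicts as parsecsv(), via csv.DictReader."""
    csv.field_size_limit(sys.maxsize)
    results = []
    with open(fname, 'r') as csvfile:
        for rownum, result in enumerate(csv.DictReader(csvfile), start=1):
            if upperbound is not None and rownum > upperbound:
                break
            result['text'] = parsebody(result['body'])
            results.append(result)
    return results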
Example #15
def __tagProcessTask(filename, savefile):
    import csv
    import sys
    import os
    import time
    csv.field_size_limit(10000000)

    b = time.time()
    f = open(filename,'rU')
    col = dict((cn, i) for i, cn in enumerate(f.readline().split(',')))
    tagRules = __getTagRules(col)
    reader = csv.reader(f,quoting=csv.QUOTE_NONE)
    data = []
    for line in reader:
            r = __tagProcess(tagRules, line)
            if r:
                data.extend(r)
    
    e = time.time()
    f.close()
    f = open(savefile,'w')
    f.writelines(['%s-%s-%d\n' %(d['userId'],d['tag'],d['tagId']) for d in data])
    f.close()
    d = ["('%s',%d, now())" %(d['userId'],d['tagId']) for d in data]
    i = 0
    c = len(d)
    #print c
    while i < c:
        s = ','.join(d[i:i+5000])
        letv_db.executesql("INSERT INTO tagLog (userId,tagId, date) VALUES %s" %s)
        letv_db.commit()
        i += 5000
        
    return 'all time: %f, data len: %d, csv line: %d' %((e-b), len(data), reader.line_num)
Example #16
    def raw(self, sample=False):
        def rows():
            for line in self._sample:
                if PY2:
                    yield line.encode('utf-8')
                else:
                    yield line
            if not sample:
                for line in self.lines:
                    if PY2:
                        yield line.encode('utf-8')
                    else:
                        yield line

        # Fix the maximum field size to something a little larger
        csv.field_size_limit(256000)

        try:
            for row in csv.reader(rows(),
                                  dialect=self._dialect, **self._overrides):
                yield [Cell(to_unicode_or_bust(c)) for c in row]
        except csv.Error as err:
            if u'newline inside string' in unicode_string(err) and sample:
                pass
            elif u'line contains NULL byte' in unicode_string(err):
                pass
            else:
                raise messytables.ReadError('Error reading CSV: %r', err)
Example #17
def scrubcsv(fnamein, fnameout, i):
    csv.field_size_limit(100000000)
    infile = csv.reader(open(fnamein, 'r'))
    outfile = csv.writer(open(fnameout, 'w'))
    for row in infile:
        if len(row) == i:
            outfile.writerow(row)
Example #18
def reada(filename="index.csv"):
    csv.field_size_limit(1000000000)  ##problem, Error: field larger than field limit (131072)
    ## http://lethain.com/entry/2009/jan/22/handling-very-large-csv-and-xml-files-in-python/
    foor=read(filename)
    gci = get_column_index
    # augment with noteid to make augmented 
    return [ [Note.objects.filter(owner=x[gci('owner_id')],jid=x[gci('jid')])[0].id] + x for x in foor if not x[gci('primary')] == '-no idea-']
Example #19
def parse_uploaded(f):
    
    try:
        
        logging.info("siamo partiti cazzo")
        
        csv.field_size_limit(1000000000)
        
        # 1. getting file encoding
        result = chardet.detect(f.read())
        encoding = result['encoding']

        # 2. determining dialect
        f.open()
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(f.read())
        dialect.delimiter = "\t"

        # 3. encoding file
        f.open()
        utf8_file = f.read().decode(encoding).encode('utf-8')
        reader = csv.DictReader( utf8_file.splitlines(), dialect=csv.excel_tab )

        # 4. get results
        results = [row for row in reader]
    
    except Exception, e:
        logging.info(str(e))
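In Python 3 the same four steps simplify, because the detected encoding can be passed straight to open() instead of re-encoding the payload; a minimal sketch for a file on disk (the function name is illustrative, and as in the original the rows are ultimately read with the csv.excel_tab dialect):

import csv
import chardet

def parse_tab_separated(path):
    """Sketch: detect the encoding, then read tab-separated rows as dicts."""
    csv.field_size_limit(1000000000)

    # 1. getting file encoding
    with open(path, 'rb') as raw:
        encoding = chardet.detect(raw.read())['encoding']

    # 2.-4. open with that encoding and read the rows
    with open(path, newline='', encoding=encoding) as f:
        reader = csv.DictReader(f, dialect=csv.excel_tab)
        return list(reader)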
Example #20
def IntronExtractor(bed12):
         # row[0]  chr
         # row[1]  alignment start
         # row[2]  alignment end
         # row[3]  name 
         # row[4]    
         # row[5]  strand
         # row[6]  aligment start
         # row[7]  aligment end
         # row[8]  
         # row[9] blocknum
         # row[10] blocksizes
         # row[11] qstarts  

	

	for row in csv.reader(open(bed12), delimiter = '\t'):
		
		csv.field_size_limit(1000000000)

		chr = row[0]
		start = row[1]
		end = row[2]
		strand = row[5]
		bn = int(row[9])
		name = row[4]

		if strand=="+":

			print "\t".join([chr, start, start, name, str(0), strand])

		elif strand=="-":

			print "\t".join([chr, end, end, name, str(0), strand])
Example #21
def uploadFromFile():
    
    dir = os.path.dirname(os.path.abspath(__file__))
    filepath = os.path.join(dir, 'test.txt')
    f = open(filepath,"r")
    
    csv.field_size_limit(1000000000)
    
    # getting file encoding
    result = chardet.detect(f.read())
    encoding = result['encoding']
            
    # determining dialect
    f.seek(0)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(f.read())
    dialect.delimiter = "\t"

    # encoding file
    f.seek(0)
    utf8_file = f.read().decode(encoding).encode('utf-8')
    reader = csv.DictReader( utf8_file.splitlines(), dialect=csv.excel_tab )
    rows = list(reader)
    
    # get results
    results = []
    
    for i, row in enumerate(rows):
        results.append(row)
        progress = 100 * float(i)/float(len(rows))
        current_task.update_state(state='PROGRESS', meta={'current': i, 'total': len(rows), 'progress': progress })

    current_task.update_state(state='SUCCESS')
    return results
Example #22
def DRcounter(file):
	reader = csv.reader(open(file), dialect='excel-tab' )
	csv.field_size_limit(1000000000)
	for row in reader:
		SJ5=row[15]
		SJ3=row[16]
		L=len(SJ5)/2
		SJ5U = SJ5[:L]
		SJ5D = SJ5[L:]
		SJ3U = SJ3[:L]
		SJ3D = SJ3[L:]
		DRU = 0
		DRD = 0
		if SJ5U==SJ3U:
			DRU = L
		else:
			while SJ5U[L-1-DRU]==SJ3U[L-1-DRU]:
				DRU = (DRU + 1)
				if  SJ5U[L-1-DRU]!=SJ3U[L-1-DRU]: 
					break
		if SJ5D==SJ3D:
			DRD = L
		else:
			while SJ5D[DRD]==SJ3D[DRD]:
				DRD = (DRD + 1)
				if SJ5D[DRD]!=SJ3D[DRD]:
					break


		print row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9], row[10], row[11], SJ5U, SJ5D, SJ3U, SJ3D, DRU, DRD, DRU+DRD	
Example #23
def do_insert_from_csv(insert, filename):
    import csv
    csv.field_size_limit(2**31)

    with open(filename, 'r', encoding="UTF-8") as commits_file:
        for row in tqdm(csv.reader(commits_file)):
            insert(*row)
Example #24
def splitByScaff(pileUpFileName, opDir):
    csv.field_size_limit(sys.maxint)
    with open(pileUpFileName, 'r') as pileUpFile:
        pileUpReader = csv.reader(pileUpFile, delimiter='\t')
        prevScaffName = ''
        currScaffFile = None
        currScaffWriter = None
        for row in pileUpReader:
            scaffName = row[Pileup_Consts.SCAFF_NAME]
            scaffNum  = int(scaffName[-5:])
            if scaffNum > 818:
                continue
            if prevScaffName == scaffName:
                #continue to write in current scaff file
                currScaffWriter.writerow(row)
            else:
                #found a new scaff, close old file
                if currScaffFile is not None:
                    currScaffFile.close()
                #open and write new file
                currScaffFile = open(\
                    os.path.join(opDir, scaffName + '.mpileup'), 'w')
                currScaffWriter = csv.writer(currScaffFile, delimiter='\t')
                currScaffWriter.writerow(row)
                prevScaffName = scaffName
Example #25
def load_traces(file_name):
  """
  Load network traces from CSV
  :param file_name: (str) name of the file
  :return traces: (dict) network traces. E.g: activeCells, sensorValues, etc.
  """

  csv.field_size_limit(sys.maxsize)

  with open(file_name, 'rb') as fr:
    reader = csv.reader(fr)
    headers = reader.next()

    traces = dict()
    for field in headers:
      traces[field] = []

    for row in reader:
      for i in range(len(row)):
        if row[i] == '':
          data = None
        else:
          data = json.loads(row[i])
        traces[headers[i]].append(data)

  return traces
Example #26
def csvload(fileName):
    csvfile = open(fileName, 'r')
    csv.field_size_limit(CSV_FILE_LIMIT)
    rdr = csv.reader(csvfile, dialect='excel', quotechar=str('"'))
    if not csv.Sniffer().has_header(csvfile.readline()):
        csvfile.seek(0)
    return rdr, csvfile, fileName
Example #27
    def parse_csv(self, doc, delim=','):
        """
        Csv reader
        =====
        Function to read in a csv file
        
        Parameters
        -----
        doc : str
            The name of the csv file

        Returns
        -----
        lines : list of lists
            Each list corresponds to the cell values of a row
        """
        csv.field_size_limit(sys.maxsize)
        try:
            lines = []
            with open(doc, 'r', encoding = 'utf-8') as csvfile:
                csv_reader = csv.reader(csvfile, delimiter = delim)
                for line in csv_reader:
                    lines.append(line)
        except:
            lines = []
            csvfile = open(doc, 'r', encoding = 'utf-8')
            csv_reader = csv.reader(line.replace('\0','') for line in csvfile.readlines())       
            for line in csv_reader:
                lines.append(line)
        return lines
Example #28
    def get_uniprot_entrez_id_map(self):
        logger.info("Mapping Uniprot ids to Entrez/ENSEMBL gene ids")
        import sys
        id_map = {}
        file = '/'.join((self.rawdir, self.files['id-map']['file']))
        with gzip.open(file, 'rb') as csvfile:
            csv.field_size_limit(sys.maxsize)
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                (uniprotkb_ac, uniprotkb_id, geneid, refseq, gi, pdb, go,
                 uniref100, unifref90, uniref50, uniparc, pir, ncbitaxon, mim,
                 unigene, pubmed, embl, embl_cds, ensembl, ensembl_trs,
                 ensembl_pro, other_pubmed) = row

                if int(ncbitaxon) not in self.tax_ids:
                    continue
                if geneid.strip() != '':
                    idlist = re.split(r';', geneid)
                    id_map[
                        uniprotkb_ac.strip()] = [
                            'NCBIGene:'+i.strip() for i in idlist]
                elif ensembl.strip() != '':
                    idlist = re.split(r';', ensembl)
                    id_map[
                        uniprotkb_ac.strip()] = [
                            'ENSEMBL:'+i.strip() for i in idlist]

        logger.info("Acquired %d uniprot-entrez mappings", len(id_map))

        return id_map
Example #29
def fake_import(request):
    
    dir = os.path.dirname(os.path.abspath(__file__))
    filepath = os.path.join(dir, 'test.txt')
    f = open(filepath,"r")
    
    csv.field_size_limit(1000000000)
    
    # getting file encoding
    result = chardet.detect(f.read())
    encoding = result['encoding']
            
    # determining dialect
    f.seek(0)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(f.read())
    dialect.delimiter = "\t"

    # encoding file
    f.seek(0)
    utf8_file = f.read().decode(encoding).encode('utf-8')
    reader = csv.DictReader( utf8_file.splitlines(), dialect=csv.excel_tab )
    rows = list(reader)
    
    # get results
    results = []
    
    for i, row in enumerate(rows):
        results.append(row)
    
    response = create_objects('items', results)

    return HttpResponse(json.dumps(response, default=bson.json_util.default), mimetype="application/json")
Example #30
 def __init__(self, logfile, overwrite):
   if overwrite:
     self.file_object = open(logfile, 'w', 1)
   else:
     self.file_object = open(logfile, 'a', 1)
   csv.field_size_limit(sys.maxsize)
   self.log_file = csv.writer(self.file_object, delimiter=',', quotechar='|', escapechar='\\', quoting=csv.QUOTE_MINIMAL)
Example #31
import csv, json, datetime as dt, time
from pprint import pprint
from tensorflow.keras.preprocessing.text import text_to_word_sequence as ttws
csv.field_size_limit(500000)
#https://github.com/first20hours/google-10000-english/blob/master/20k.txt

#dataSource = "source/requests_data.csv"
dataSource = "../data/LastMonthRequests.csv"
commonWordslist = "../data/90.txt"
incidentNumberIndex = 0  #1
summaryIndex = 1  #4
tier1Index = 2  #9
tier2Index = 3  #10
tier3Index = 4  #11
paginationLimit = 100000
recordsLimit = [300000]
wordsCounter = 0
requestDict = {}
accessDict = {}
fileDict = {}
lotusDict = {}
outlookDict = {}
emailDict = {}
awsDict = {}
azureDict = {}
sqlDict = {}


def filterText(record, field):
    cleanWords = set(ttws(record[field]))
    for word in cleanWords:
Example #32
from __future__ import print_function
from sys import stdout, maxsize
import csv

maxInt = 2147483647
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    # http://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072

    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        decrement = True

verb = False
"""
Doc :
    A class to read the SWATH scoring logs from Peakview and OpenSWATH

    The data is returned from the "parse_files" functions as a list of runs.
    Each run contains a number of precursors and each precursors contains
    peakgroups (e.g. peakgroups that were found in chromatographic space).

    usage: 
    reader = SWATHScoringReader.newReader(options.infiles, options.file_format)
Example #33
import os
import sys
import numpy as np
from collections import defaultdict
import cPickle as pickle
import pandas as pd
import csv
from scipy import spatial
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string
import codecs
import argparse

csv.field_size_limit(sys.maxsize)
reload(sys)
sys.setdefaultencoding('utf-8')

parser = argparse.ArgumentParser()
parser.add_argument('--bursty_issues_dir',
                    help='Folder containing burst wise issues',
                    default='Sample_Data/burst_issues/')
parser.add_argument('--raw_commit_dir',
                    help='Directory containing raw commit files per project',
                    default='Sample_Data/raw_commits/')
parser.add_argument(
    '--ins_del_count_dir',
    help=
    'Directory containing insertion/deletion counts per commit per project',
    default='Sample_Data/ModRequest/insertion_deletion_counts/')
Example #34
def combine_cluster_labels(config: configparser.SectionProxy):
    # If you have a partially-processed dataset disable the parts you have
    # already done
    write_joined_cluster_labels = True
    write_cluster_epsilon_representative_docid = True

    logger.info("Reading intermediate file")
    hierarchyMaxEpsilon: int = int(get_common_config()['HierarchyMaxEpsilon'])
    csv.field_size_limit(2**23)

    fin = open(config['IntermediateColumnFilteredFileName'],
               'r',
               encoding='utf-8')
    fin.readline()
    fout = open(config['OutputDocumentMetadataFileName'],
                'w',
                encoding='utf-8',
                newline='')

    logger.info("Writing document metadata file")
    csvreader = csv.reader(fin, delimiter=',')
    csvwriter = csv.writer(fout, delimiter=',')
    csvwriter.writerow(['documentId', 'firstScrapedDate', 'title'])
    hash_doc = {}
    for line in csvreader:
        (documentId, firstScrapedDate, title, domain, text) = line
        hash_doc[documentId] = title
        csvwriter.writerow([documentId, firstScrapedDate, title])
    fout.close()
    fin.close()

    logger.info("Finished writing document metadata")

    fin = open(get_epsilon_cluster_filename(config, hierarchyMaxEpsilon), 'r')
    hash_clusterhit = {}
    for line2 in fin:
        clusterid = int(line2.split(',')[0])
        docid = line2.split(',')[1].strip()

        hash_clusterhit[docid] = str(clusterid)

    fin.close()

    if write_joined_cluster_labels:
        logger.info("Writing joined cluster labels")
        fout = open(config['OutputJoinedClusterLabelsFileName'], 'w')
        fout.write('clusterid,docid,epsilon\n')
        hash_clustereps: Dict[Tuple[str, int], List[str]] = {}
        hash_clusterstats: Dict[Tuple[str, int], int] = {}
        for x in range(0, int(hierarchyMaxEpsilon) + 1):
            fin = open(get_epsilon_cluster_filename(config, x), 'r')
            for line3 in fin:
                # Get the epsilon and the clusterid
                clusterid = int(line3.split(',')[0])

                ky = (str(x), clusterid)
                # Track the size of the cluster
                if ky not in hash_clusterstats:
                    hash_clusterstats[ky] = 0
                # for computing size later
                hash_clusterstats[ky] += 1

                docid = line3.split(',')[1].strip()

                if ky not in hash_clustereps:
                    hash_clustereps[ky] = []
                hash_clustereps[ky].append(docid)
                fout.write(line3.strip() + ',' + str(x) + '\n')
            fin.close()
        fout.close()

    if write_cluster_epsilon_representative_docid:
        logger.info("Writing cluster epsilon representative docids")
        fout = open(config['IntermediateClusterEpsilonRepresentativeDocId'],
                    'w',
                    encoding='utf-8')
        _ = fout.write(
            'epsilon,clusterid,representativedocid,ancestorclusterid,size,titlesummary\n'
        )
        for ky2 in hash_clustereps:
            repdoc: Optional[str] = None
            maxtitlehits = 0
            hash_titles = {}
            bow_arry = []
            # Skip the -1 clusters.  These are nodes/documents that do not fit in a cluster
            if ky2[1] == -1:
                continue
            # iterate through each cluster/eps pairing and get the most frequently occurring title
            for doc in hash_clustereps[ky2]:
                title = hash_doc[doc]
                bow_arry.append(title)
                if repdoc is None:
                    repdoc = doc
                    maxtitlehits = 1
                    hash_titles[title] = (doc, 1)
                    continue
                else:
                    if title in hash_titles:
                        pair = hash_titles[title]
                        hits = 1 + pair[1]
                        if hits > maxtitlehits:
                            maxtitlehits = hits
                            repdoc = pair[0]
                            # save the incremented value
                            hash_titles[title] = (pair[0], hits)
                    else:
                        hash_titles[title] = (doc, 1)
            count_vec = sklearn.feature_extraction.text.CountVectorizer(
                'content', stop_words='english')
            try:
                model = count_vec.fit(bow_arry)
                invmodel = {v: k for k, v in model.vocabulary_.items()}
                fit_matrix = count_vec.transform(bow_arry)

                # can make this a lot faster if we just sum by column!
                # Summation by column
                topterms = []
                columnsums = fit_matrix.sum(axis=0)
                # Get the top items

                arryidx = numpy.array(columnsums)[0]
                # Sort the array by index - keep the top 10 terms
                sortedarry = numpy.flip(numpy.argsort(arryidx), axis=0)[0:10]
                # Then pull out the vocab terms
                for vocab_key in sortedarry:
                    topterms.append(invmodel[vocab_key])
                # Join them together for a summary
                termconcat2 = '\t'.join(topterms)
            except Exception:
                termconcat2 = ''

            _ = fout.write(
                f'{str(ky2[0])},{str(ky2[1])},{repdoc},{str(hash_clusterhit[str(repdoc)])},'
                f'{str(hash_clusterstats[ky2])},{termconcat2}\n')
        fout.close()
        logger.info("Finished combine_cluster_labels")
Example #35
from Orange.data import (
    _io,
    is_discrete_values,
    MISSING_VALUES,
    Table,
    Domain,
    Variable,
    DiscreteVariable,
    StringVariable,
    ContinuousVariable,
    TimeVariable,
)
from Orange.util import Registry, flatten, namegen

# Support values longer than 128K (i.e. text contents features)
csv.field_size_limit(100 * 1024 * 1024)

__all__ = ["Flags", "FileFormat"]

_IDENTITY = lambda i: i


class Compression:
    """Supported compression extensions"""

    GZIP = ".gz"
    BZIP2 = ".bz2"
    XZ = ".xz"
    all = (GZIP, BZIP2, XZ)
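The Compression class above only lists the supported suffixes; a dispatcher that picks an opener from the file extension could look like the sketch below (standard library only, not Orange's actual loading code):

import bz2
import gzip
import lzma
import os

# illustrative mapping from the Compression suffixes to stdlib openers
_OPENERS = {".gz": gzip.open, ".bz2": bz2.open, ".xz": lzma.open}

def open_maybe_compressed(path, mode="rt", encoding="utf-8"):
    """Sketch: open a possibly-compressed file based on its extension."""
    opener = _OPENERS.get(os.path.splitext(path)[1], open)
    return opener(path, mode, encoding=encoding)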

Example #36
#!/usr/bin/env python

import sys
import csv
import errno

csv.field_size_limit(
    sys.maxsize)  # Or else it cannot handle fields longer than 131072

tabin = csv.reader(sys.stdin, dialect=csv.excel_tab)
commaout = csv.writer(sys.stdout, dialect=csv.excel, lineterminator='\n')

try:
    for row in tabin:
        commaout.writerow(row)
except IOError as e:
    if e.errno == errno.EPIPE:
        pass
    else:
        raise
Example #37
import sys
import csv
csv.field_size_limit(sys.maxint)
from itertools import repeat


def select_data(data_num, percentage):
    new_data_num = []
    for i in data_num:
        new_data_num.append(int(i * percentage))
    return new_data_num


def data_select(
    chunk, fold, user_select, file
):  #fold = can generate percentage of the data for test and for training
    user_log_num = []
    count = 1
    cur_user = 0
    index_count = 0
    index = 0
    with open(file, 'rb') as tsvin:
        Input = csv.reader(tsvin, delimiter='\t')
        for row in Input:
            if (cur_user != row[0]):
                cur_user = row[0]
                user_log_num.append(count)
                count = 1
            else:
                count = count + 1
        user_log_num.append(count)
Example #38
def main():
    parser = argparse.ArgumentParser(
        description = 'VSM - Feature Extraction')

    help_msgs = []
    # Input arguments #
    help_msgs.append('corpus_pp path')
    help_msgs.append('predefined vectors path')
    help_msgs.append('word embeddings path')
    
    # Output arguments #
    help_msgs.append('features vsm path')

    default_paths = [join(getcwd(), 'vsm')]

    # Input arguments #
    parser.add_argument('corpus_pp_path', help=help_msgs[0])
    parser.add_argument('predefined_vectors_path', help=help_msgs[1])
    parser.add_argument('word_embeddings_path', help=help_msgs[2])

    # Output arguments #
    parser.add_argument('--output', action='store', metavar='PATH',
                        default=default_paths[0], help=help_msgs[3],
                        dest='output_path')

    # Arguments parsing #
    args = parser.parse_args()

    # Check if input exists and is a directory. Otherwise, exit
    # No extra indentation.
    if not isdir(args.corpus_pp_path):
        sys.exit('The input path does not point to a valid directory')
    # Create the 'Output' directory if it does not exist #
    if not isdir(args.output_path):
        makedirs(args.output_path)

    corpus = []
    users = []
    register_dialect('tab', delimiter='\t')

    filepath = join(args.corpus_pp_path, 'corpus_pp.tsv')
    field_size_limit(sys.maxsize)
    with open(filepath, 'rt') as fp:
        r = reader(fp, dialect='tab')
        # Discard Header #
        next(r)
        # * username
        # * content_pp
        for row in r:
            username = row[0]
            content_pp = row[1]
            corpus.append(content_pp)
            users.append(username)

    vectorizer = CountVectorizer()
    # In case you are interested in trying other types other weighting schemes
    # vectorizer = TfidfVectorizer()

    term_doc_mtx = vectorizer.fit_transform(corpus).toarray()
    idx2term = vectorizer.get_feature_names()
    print(' -- Term-Doc Matrix Created --')

    # Load Personality (big-5) Vectors #
    filepath = join(args.predefined_vectors_path, 'pb5.vec')
    pb5_vec = load_predefined_vectors(filepath)

    # Load Personality Disorders Vectors #
    filepath = join(args.predefined_vectors_path, 'pd.vec')
    pd_vec = load_predefined_vectors(filepath)

    # Load word embeddings #
    word_embb = KeyedVectors.load_word2vec_format(args.word_embeddings_path, binary=True)
    print(' -- Word Embeddings loaded --')
    
    # For each row of the document-term matrix: #
    # * For each term t_i in document d_j #
    # * Compute sim(t_i, v_k) where v is a word in pb5_vector/pd_vector #
    # * Compute the average of the similarities obtained #

    x = vsm(idx2term, term_doc_mtx, pb5_vec, pd_vec, word_embb)
    # Replace nan with zero #
    np.nan_to_num(x, copy=False)
    filepath = join(args.output_path, 'personality_scores')
    np.save(filepath, x)

    filepath = join(args.output_path, 'users.txt')
    with open(filepath, 'w') as fp:
        for username in users:
            fp.write('%s\n' % username)

    return 0
Example #39
def main():

    args = parse_args()
    csv.field_size_limit(sys.maxsize)

    print("A) Load data")
    transform = transforms.Compose([
        transforms.Resize(255),
        transforms.CenterCrop(224),
        transforms.ToTensor()
    ])

    if args.tiny:
        train_data = CocoCaptions(args.train_img_path,
                                  args.train_ann_file,
                                  transform=transform,
                                  split="tiny")
        dev_data = CocoCaptions(args.train_img_path,
                                args.train_ann_file,
                                transform=transform,
                                split="tiny")
    elif args.restval:
        train_data = CocoCaptions((args.train_img_path, args.dev_img_path),
                                  (args.train_ann_file, args.dev_ann_file),
                                  transform=transform,
                                  split="restval")
        dev_data = CocoCaptions(args.dev_img_path,
                                args.dev_ann_file,
                                transform=transform,
                                split="dev")
    else:
        train_data = CocoCaptions(args.train_img_path,
                                  args.train_ann_file,
                                  transform=transform,
                                  split="train")
        dev_data = CocoCaptions(args.dev_img_path,
                                args.dev_ann_file,
                                transform=transform,
                                split="dev")

    print("B) Load model")
    if args.model == "vse++":
        raise NotImplementedError
    elif args.model == "univse":
        if args.simple:
            model = simp_univse.UniVSE.from_filename(args.vocab_file,
                                                     train_cnn=args.train_cnn)
        else:
            model = univse.UniVSE.from_filename(args.vocab_file,
                                                train_cnn=args.train_cnn)
            model.vocabulary_encoder.add_graphs(args.graph_file)
        # Randomize modifier
        model.vocabulary_encoder.modif = torch.nn.Embedding(
            len(model.vocabulary_encoder.corpus), 100)
        model.vocabulary_encoder.modif.weight.data.uniform_(-0.1, 0.1)
        model.vocabulary_encoder.modif.weight.data[
            model.vocabulary_encoder.train_corpus_length:] = torch.zeros(
                (len(model.vocabulary_encoder.corpus) -
                 model.vocabulary_encoder.train_corpus_length, 100))
    else:
        print("ERROR: model name unknown."
              )  # You shouldn't be able to reach here!
        return

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Observe that all parameters are being optimized
    optimizer = optim.Adam(model.params, lr=args.lr)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                  milestones=[args.lr_update],
                                                  gamma=0.1)

    print("C) Train model")
    train_params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': 6
    }
    train_gen = data.DataLoader(train_data, **train_params)

    dev_params = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'num_workers': 6
    }
    dev_gen = data.DataLoader(dev_data, **dev_params)

    train_losses = []
    dev_losses = []

    ir_r1_1k, ir_r5_1k, ir_r10_1k = [], [], []
    tr_r1_1k, tr_r5_1k, tr_r10_1k = [], [], []
    ir_r1_5k, ir_r5_5k, ir_r10_5k = [], [], []
    tr_r1_5k, tr_r5_5k, tr_r10_5k = [], [], []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_modif_emb = copy.deepcopy(model.vocabulary_encoder.modif)
    best_rsum = 0

    t_epoch = tqdm(range(1, args.epochs + 1), desc="Epoch")
    for epoch in t_epoch:

        if epoch > 2:
            model.criterion.n_r = 1.0

        # Each epoch has a training and validation phase
        for phase in ['train', 'dev']:
            if phase == 'train':
                generator = train_gen
                model.train_start()  # Set model to training mode
            else:
                generator = dev_gen
                model.val_start()  # Set model to evaluate mode

            running_loss = 0.0
            idx = 0

            img_embeddings = np.zeros((len(dev_data), args.hidden_size))
            cap_embeddings = np.zeros((len(dev_data), args.hidden_size))
            count = 0

            # Iterate over data.
            t_batch = tqdm(generator, desc="Batch", leave=False)
            for img, sent in t_batch:

                sentences = list(sent)
                embeddings = model(img, sentences)

                time_start = time.time()
                total_loss, other_loss = model.criterion(embeddings)
                if not args.simple:
                    model.times["loss"] += time.time() - time_start

                # ####### DEBUG ######## #
                if not args.simple and epoch == 1 and idx == 100:
                    with open("times.txt", "w") as t_file:
                        t_file.write(f" # EPOCH {epoch}\t# BATCH {idx} #\n")
                        t_file.write(
                            f"Image:  {model.times['image'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Input:  {model.times['input'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Vocab:  {model.times['vocab'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Object: {model.times['object'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Neural: {model.times['neural'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Compos: {model.times['comp'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Unflat: {model.times['unflatten'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(
                            f"Loss:   {model.times['loss'] * 1000 / model.times['n']} ms\n"
                        )
                        t_file.write(f"\n")

                if phase == "dev":
                    aux_count = count + embeddings["sent_emb"].size(0)
                    img_embeddings[count:aux_count] = embeddings[
                        "img_emb"].data.cpu().numpy().copy()
                    cap_embeddings[count:aux_count] = embeddings[
                        "sent_emb"].data.cpu().numpy().copy()
                    count = aux_count

                if phase == "train":
                    optimizer.zero_grad()
                    total_loss.backward()
                    if model.grad_clip > 0:
                        clip_grad_norm_(model.params, model.grad_clip)
                    optimizer.step()
                    lr_scheduler.step(epoch - 1)

                total_loss = float(total_loss.data.cpu().numpy())
                t_batch.set_description(f"Batch Loss: {total_loss:.6f}")
                running_loss += total_loss
                idx += 1

            running_loss /= idx

            if phase == "train":
                train_losses.append(running_loss)
            else:
                dev_losses.append(running_loss)

                # Compute R@k values for 1K Validation
                rt = itr.i2t(img_embeddings[:5000],
                             cap_embeddings[:5000],
                             measure='cosine',
                             return_ranks=False)
                ri = itr.t2i(img_embeddings[:5000],
                             cap_embeddings[:5000],
                             measure='cosine',
                             return_ranks=False)
                current_rsum_1k = ri[0] + ri[1] + ri[2] + rt[0] + rt[1] + rt[2]

                ir_r1_1k.extend([ri[0]])
                ir_r5_1k.extend([ri[1]])
                ir_r10_1k.extend([ri[2]])

                tr_r1_1k.extend([rt[0]])
                tr_r5_1k.extend([rt[1]])
                tr_r10_1k.extend([rt[2]])

                # Compute R@k values for 5K Validation
                rt = itr.i2t(img_embeddings,
                             cap_embeddings,
                             measure='cosine',
                             return_ranks=False)
                ri = itr.t2i(img_embeddings,
                             cap_embeddings,
                             measure='cosine',
                             return_ranks=False)

                current_rsum = ri[0] + ri[1] + ri[2] + rt[0] + rt[1] + rt[2]
                t_epoch.set_description(
                    f"Epoch RSum: {current_rsum_1k:.1f} (1K) / {current_rsum:.1f} (5K)"
                )

                ir_r1_5k.extend([ri[0]])
                ir_r5_5k.extend([ri[1]])
                ir_r10_5k.extend([ri[2]])

                tr_r1_5k.extend([rt[0]])
                tr_r5_5k.extend([rt[1]])
                tr_r10_5k.extend([rt[2]])

                # Deep copy the model if it's the best rsum
                if current_rsum > best_rsum:
                    del best_modif_emb, best_model_wts
                    best_rsum = current_rsum
                    best_modif_emb = copy.deepcopy(
                        model.vocabulary_encoder.modif)
                    best_model_wts = copy.deepcopy(model.state_dict())

                # Plot recall@k values
                if args.plot and epoch > 1:
                    fig = plotter.plot_recall_curve(
                        range(1, epoch + 1),
                        ir_r1_1k,
                        ir_r5_1k,
                        ir_r10_1k,
                        title="Image Retrieval (1K)")
                    plt.savefig(
                        os.path.join(
                            args.output_path,
                            f"training_recalls_{args.model}_ir_1k.png"))
                    plt.close(fig)

                    fig = plotter.plot_recall_curve(
                        range(1, epoch + 1),
                        tr_r1_1k,
                        tr_r5_1k,
                        tr_r10_1k,
                        title="Text Retrieval (1K)")
                    plt.savefig(
                        os.path.join(
                            args.output_path,
                            f"training_recalls_{args.model}_tr_1k.png"))
                    plt.close(fig)

                    fig = plotter.plot_recall_curve(
                        range(1, epoch + 1),
                        ir_r1_5k,
                        ir_r5_5k,
                        ir_r10_5k,
                        title="Image Retrieval (5K)")
                    plt.savefig(
                        os.path.join(
                            args.output_path,
                            f"training_recalls_{args.model}_ir_5k.png"))
                    plt.close(fig)

                    fig = plotter.plot_recall_curve(
                        range(1, epoch + 1),
                        tr_r1_5k,
                        tr_r5_5k,
                        tr_r10_5k,
                        title="Text Retrieval (5K)")
                    plt.savefig(
                        os.path.join(
                            args.output_path,
                            f"training_recalls_{args.model}_tr_5k.png"))
                    plt.close(fig)

            # Save intermediate loss and recall plots after the second epoch
            if args.plot and phase == "dev" and epoch > 1:
                fig = plotter.plot_loss_curve(range(1, epoch + 1),
                                              train_losses,
                                              dev_losses,
                                              yexp=True)
                plt.savefig(
                    os.path.join(args.output_path,
                                 f"training_losses_{args.model}.png"))
                plt.close(fig)

    model.load_state_dict(best_model_wts)
    model.save_model(os.path.join(args.output_path, f"best_{args.model}.pth"))

    model.vocabulary_encoder.modif = best_modif_emb
    model.vocabulary_encoder.save_corpus(
        os.path.join(args.output_path, f"best_corpus_{args.model}.pickle"))

    with open(os.path.join(args.output_path, "losses.pickle"), "wb") as f:
        losses = {"train": train_losses, "dev": dev_losses}
        pickle.dump(losses, f)

    with open(os.path.join(args.output_path, "recalls_at_k.pickle"),
              "wb") as f:
        recalls_at_k = {
            "ir_r1_1k": ir_r1_1k,
            "ir_r5_1k": ir_r5_1k,
            "ir_r10_1k": ir_r10_1k,
            "tr_r1_1k": tr_r1_1k,
            "tr_r5_1k": tr_r5_1k,
            "tr_r10_1k": tr_r10_1k,
            "ir_r1_5k": ir_r1_5k,
            "ir_r5_5k": ir_r5_5k,
            "ir_r10_5k": ir_r10_5k,
            "tr_r1_5k": tr_r1_5k,
            "tr_r5_5k": tr_r5_5k,
            "tr_r10_5k": tr_r10_5k
        }
        pickle.dump(recalls_at_k, f)
Example #40
sys.path.insert(0,
                '/edx/app/hadoop/pipeline/local/lib/python2.7/site-packages')

from edx.analytics.tasks.common.mapreduce import MapReduceJobTask, MapReduceJobTaskMixin
from edx.analytics.tasks.common.mysql_load import MysqlInsertTask
from edx.analytics.tasks.common.sqoop import SqoopImportFromMysql
from edx.analytics.tasks.export.database_exports import FIELD_SIZE_LIMIT, StudentModuleRecord
from edx.analytics.tasks.util import csv_util
from edx.analytics.tasks.util.url import get_target_from_url, url_path_join

log = logging.getLogger(__name__)

# Increase maximum number of characters per field since we have
# entries that easily exceed the default value of 128 KB.
csv.field_size_limit(FIELD_SIZE_LIMIT)


######################################################################
#       Abstract Import and Histogram Calculation Section            #
######################################################################
class HistogramTaskFromSqoopParamsMixin(object):
    """
    Mixin the parameters for HistogramsFromStudentModule that involve Sqoop

    """
    name = luigi.Parameter(description='Name of this run', )
    dest = luigi.Parameter(
        description='URL of S3 location/directory where the task outputs', )
    credentials = luigi.Parameter(
        config_path={
Example #41
    def setUp(self):
        self.lim = csv.field_size_limit()

        with open('.test.csv', 'w') as f:
            f.write('a' * 10)
Example #42
    def __init__(self,
                 streamID,
                 write=False,
                 fields=None,
                 missingValues=None,
                 bookmark=None,
                 includeMS=True,
                 firstRecord=None):
        """
    streamID:
        CSV file name, input or output
    write:
        True or False, open for writing if True
    fields:
        a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors, only
        applicable when write==True
    missingValues:
        what missing values should be replaced with?
    bookmark:
        a reference to the previous reader, if passed in, the records will be
        returned starting from the point where bookmark was requested. Either
        bookmark or firstRecord can be specified, not both. If bookmark is used,
        then firstRecord MUST be None.
    includeMS:
        If false, the microseconds portion is not included in the
        generated output file timestamp fields. This makes it compatible
        with reading in from Excel.
    firstRecord:
        0-based index of the first record to start reading from. Either bookmark
        or firstRecord can be specified, not both. If bookmark is used, then
        firstRecord MUST be None.

    Each field is a 3-tuple (name, type, special or FieldMetaSpecial.none)

    The name is the name of the field. The type is one of the constants in
    `FieldMetaType`. The special is one of the `FieldMetaSpecial` values
    that designate their field as the sequenceId, reset, timestamp, or category.
    With exception of multiple categories, there can be at most one of each.
    There may be multiple fields of type datetime, but no more than one of them
    may be the timestamp field (FieldMetaSpecial.timestamp). The sequence id
    field must be either a string or an int. The reset field must be an int (and
    must contain 0 or 1).

    The category field must be an int or space-separated list of ints, where
    the former represents single-label classification and the latter is for
    multi-label classification (e.g. "1 3 4" designates a record for labels 1,
    3, and 4). The number of categories is allowed to vary record to record;
    sensor regions represent non-categories with -1, thus the category values
    must be >= 0.

    The FileRecordStream iterates over the field names, types and specials and
    stores the information.
    """
        super(FileRecordStream, self).__init__()

        # Only bookmark or firstRow can be specified, not both
        if bookmark is not None and firstRecord is not None:
            raise RuntimeError(
                "Only bookmark or firstRecord can be specified, not both")

        if fields is None:
            fields = []
        if missingValues is None:
            missingValues = ['']

        # We'll be operating on csvs with arbitrarily long fields
        size = 2**27
        csv.field_size_limit(size)

        self._filename = streamID
        # We can't guarantee what system files are coming from, use universal
        # newlines
        self._write = write
        self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
        self._file = open(self._filename, self._mode)
        self._sequences = set()
        self.rewindAtEOF = False

        if write:
            assert fields is not None
            assert isinstance(fields, (tuple, list))
            # Verify all fields are 3-tuple
            assert all(
                isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                for f in fields)
            names, types, specials = zip(*fields)
            self._writer = csv.writer(self._file)
        else:
            # Make sure readline() works on windows too
            os.linesep = '\n'
            # Read header lines
            self._reader = csv.reader(self._file, dialect="excel")
            try:
                names = [n.strip() for n in self._reader.next()]
            except:
                raise Exception('The header line of the file %s contained a NULL byte' \
                                % self._filename)
            types = [t.strip() for t in self._reader.next()]
            specials = [s.strip() for s in self._reader.next()]

            # If there are no specials, this means there was a blank line
            if len(specials) == 0:
                specials = [""]

        if not len(names) == len(types) == len(specials):
            raise Exception('Invalid file format: different number of fields '
                            'in the header rows of file %s (%d, %d, %d)' %
                            (streamID, len(names), len(types), len(specials)))

        # Verify standard file format
        for t in types:
            if not FieldMetaType.isValid(t):
                raise Exception(
                    'Invalid file format for "%s" - field type "%s" '
                    'not a valid FieldMetaType' % (
                        self._filename,
                        t,
                    ))

        for s in specials:
            if not FieldMetaSpecial.isValid(s):
                raise Exception(
                    'Invalid file format. \'%s\' is not a valid special '
                    'flag' % s)

        self._fields = [
            FieldMetaInfo(*attrs) for attrs in zip(names, types, specials)
        ]
        self._fieldCount = len(self._fields)

        # Keep track on how many records have been read/written
        self._recordCount = 0

        self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp) if
                              FieldMetaSpecial.timestamp in specials else None)
        self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                          if FieldMetaSpecial.reset in specials else None)
        self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence) if
                               FieldMetaSpecial.sequence in specials else None)
        self._categoryIdx = (specials.index(FieldMetaSpecial.category) if
                             FieldMetaSpecial.category in specials else None)
        self._learningIdx = (specials.index(FieldMetaSpecial.learning) if
                             FieldMetaSpecial.learning in specials else None)

        # keep track of the current sequence
        self._currSequence = None
        self._currTime = None

        if self._timeStampIdx:
            assert types[self._timeStampIdx] == FieldMetaType.datetime
        if self._sequenceIdIdx:
            assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                                  FieldMetaType.integer)
        if self._resetIdx:
            assert types[self._resetIdx] == FieldMetaType.integer
        if self._categoryIdx:
            assert types[self._categoryIdx] in (FieldMetaType.list,
                                                FieldMetaType.integer)
        if self._learningIdx:
            assert types[self._learningIdx] == FieldMetaType.integer

        # Convert the types to the actual types in order to convert the strings
        if self._mode == self._FILE_READ_MODE:
            m = {
                FieldMetaType.integer: intOrNone,
                FieldMetaType.float: floatOrNone,
                FieldMetaType.boolean: parseBool,
                FieldMetaType.string: unescape,
                FieldMetaType.datetime: parseTimestamp,
                FieldMetaType.sdr: parseSdr,
                FieldMetaType.list: parseStringList
            }
        else:
            if includeMS:
                datetimeFunc = serializeTimestamp
            else:
                datetimeFunc = serializeTimestampNoMS
            m = {
                FieldMetaType.integer: str,
                FieldMetaType.float: str,
                FieldMetaType.string: escape,
                FieldMetaType.boolean: str,
                FieldMetaType.datetime: datetimeFunc,
                FieldMetaType.sdr: serializeSdr,
                FieldMetaType.list: stripList
            }

        self._adapters = [m[t] for t in types]

        self._missingValues = missingValues

        #
        # If the bookmark is set, we need to skip over first N records
        #
        if bookmark is not None:
            rowsToSkip = self._getStartRow(bookmark)
        elif firstRecord is not None:
            rowsToSkip = firstRecord
        else:
            rowsToSkip = 0

        while rowsToSkip > 0:
            self.next()
            rowsToSkip -= 1

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None
Пример #43
0
# set of utilities to interact with files

# @author: rm3086 (at) columbia (dot) edu

import csv, shutil, os, sys, glob
import _pickle as cPickle
import struct
platform_c_maxint = 2**(struct.Struct('i').size * 8 - 1) - 1
csv.field_size_limit(platform_c_maxint)
from .log import strd_logger

# logger
global log
log = strd_logger('file')


# check if a file exists
def file_exist(fname):
    try:
        with open(fname, 'r'):
            return True
    except IOError:
        return False


# create the directory if it does not already exist
def mkdir(dirname):
    try:
        os.makedirs(dirname)
    except OSError:
        pass
Пример #44
0
    def tearDown(self):
        # Resetting limit to avoid failure in other tests.
        csv.field_size_limit(self.lim)
        os.remove('.test.csv')
Пример #45
0
    def csvInit():
        csv.field_size_limit(1000000000)
        return 0
Пример #46
0
import optparse
import fileinput
import collections
import datetime
import time
import codecs   # needed by maybeOpen() below
import csv      # needed for csv.field_size_limit() below
import types    # needed for the types.StringType check in maybeOpen()
"""
Essentially reverses the process of bundle-items.

Processes the CSV download from MTurk and bursts out multiple items in each HIT.
Each field name that ends in "_1", "_2" etc is assumed to be such a multiplexed field.
Any other fields will be repeated in the output.

Can produce JSON format rather than CSV if desired.
"""

csv.field_size_limit(10**6)

######################################################################


def maybeOpen(file, mode="r", encoding="utf8"):
    if type(file) is types.StringType:
        file = open(file, mode)
    if encoding:
        file = (mode == "r" and codecs.getreader
                or codecs.getwriter)(encoding)(file)
    return file


######################################################################
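The docstring above states the de-multiplexing rule only in words: field names ending in "_1", "_2" and so on hold the bundled items, and every other field is repeated once per item. A minimal sketch of that bursting step, assuming exactly that naming convention, could look like the following; burst_row and its regular expression are illustrative and not taken from the original script.

import re

def burst_row(row):
    """Yield one dict per bundled item in a single MTurk result row."""
    multiplexed = {}   # item index -> {base_field_name: value}
    shared = {}        # fields without a numeric suffix, repeated for every item
    for name, value in row.items():
        m = re.match(r"^(.*)_(\d+)$", name)
        if m:
            base, idx = m.group(1), int(m.group(2))
            multiplexed.setdefault(idx, {})[base] = value
        else:
            shared[name] = value
    for idx in sorted(multiplexed):
        item = dict(shared)
        item.update(multiplexed[idx])
        yield item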
Пример #47
0
    def backup(self):
        """
            Backup the database to a local SQLite database

            @ToDo: Option to use a temporary DB in Postgres/MySQL as this takes
                   too long for a large DB
        """

        moves = self.moves
        news = self.news
        strints = self.strints
        strbools = self.strbools
        if not moves and not news and not strbools and not strints:
            # Nothing to backup
            return

        import os

        db = self.db
        folder = "%s/databases/backup" % current.request.folder

        # Create clean folder for the backup
        if os.path.exists(folder):
            import shutil
            shutil.rmtree(folder)
            import time
            time.sleep(1)
        os.mkdir(folder)

        # Setup backup database
        db_bak = DAL("sqlite://backup.db",
                     folder=folder,
                     adapter_args={"foreign_keys": False})

        # Copy Table structure
        skip = []
        for tablename in db.tables:
            if tablename == "gis_location":
                table = db[tablename]
                fields = [
                    table[field] for field in table.fields
                    if field != "the_geom"
                ]
                try:
                    db_bak.define_table(tablename, *fields)
                except KeyError:
                    # Can't resolve reference yet
                    # Cleanup
                    del db_bak[tablename]
                    # Try later
                    skip.append(tablename)
            else:
                try:
                    db_bak.define_table(tablename, db[tablename])
                except KeyError:
                    # Can't resolve reference yet
                    # Cleanup
                    del db_bak[tablename]
                    # Try later
                    skip.append(tablename)
        while skip:
            _skip = []
            for tablename in skip:
                if tablename == "gis_location":
                    table = db[tablename]
                    fields = [
                        table[field] for field in table.fields
                        if field != "the_geom"
                    ]
                    try:
                        db_bak.define_table(tablename, *fields)
                    except KeyError:
                        # Can't resolve reference yet
                        # Cleanup
                        del db_bak[tablename]
                        # Try later
                        _skip.append(tablename)
                    except:
                        import sys
                        print "Skipping %s: %s" % (tablename,
                                                   sys.exc_info()[1])
                else:
                    try:
                        db_bak.define_table(tablename, db[tablename])
                    except KeyError:
                        # Can't resolve reference yet
                        # Cleanup
                        del db_bak[tablename]
                        # Try later
                        _skip.append(tablename)
                    except:
                        import sys
                        print "Skipping %s: %s" % (tablename,
                                                   sys.exc_info()[1])
            skip = _skip

        # Which tables do we need to backup?
        tables = []
        if moves:
            for tablename in moves:
                tables.append(tablename)
        if news:
            for tablename in news:
                new = news[tablename]
                for t in new["tables"]:
                    tables.append(t)
                for s in new["supers"]:
                    tables.append(s)
                    stable = db[s]
                    rows = db(stable._id > 0).select(stable.instance_type)
                    instance_types = set([r.instance_type for r in rows])
                    for t in instance_types:
                        tables.append(t)
        if strbools:
            for tablename, fieldname in strbools:
                tables.append(tablename)
        if strints:
            for tablename, fieldname in strints:
                tables.append(tablename)

        # Remove duplicates
        tables = set(tables)

        # Copy Data
        import csv
        csv.field_size_limit(2**20 * 100)  # 100 megs
        for tablename in tables:
            filename = "%s/%s.csv" % (folder, tablename)
            file = open(filename, "w")
            rows = db(db[tablename].id > 0).select()
            rows.export_to_csv_file(file)
            file.close()
            file = open(filename, "r")
            db_bak[tablename].import_from_csv_file(
                file, unique="uuid2")  # uuid2 designed to not hit!
            file.close()
            db_bak.commit()

        # Pass handle back to other functions
        self.db_bak = db_bak
Пример #48
0
    print('lda took: ', end - start, ' seconds')
    return lda, get_doc_topic(corpus, lda, id_to_bow), dictionary


def viz(lda, corpus, dictionary, html_fn):
    lda_visualization = pyLDAvis.gensim.prepare(lda,
                                                corpus,
                                                dictionary,
                                                sort_topics=False)
    pyLDAvis.save_html(lda_visualization, html_fn)


#main <----------------
k = int(sys.argv[2])
fn = sys.argv[1]
csv.field_size_limit(524288)
print("reading in text data...")
id_and_text = []
id_to_source = dict()
with open(fn, newline='') as data:

    reader = csv.reader(data)

    for line in reader:

        idn = line[0]
        text = line[1]
        id_to_source[idn] = line[2]

        text = text.replace('‘', '\'').replace('’', '\'').replace(
            '“',
Пример #49
0
class BasicProcessor(BasicWorker, metaclass=abc.ABCMeta):
    """
	Abstract post-processor class

	A post-processor takes a finished search query as input and processes its
	result in some way, with another result set as output. The input thus is
	a CSV file, and the output (usually) as well. In other words, the result of
	a post-processor run can be used as input for another post-processor
	(though whether and when this is useful is another question).
	"""
    db = None  # database handler
    dataset = None  # Dataset object representing the dataset to be created
    job = None  # Job object that requests the execution of this processor
    parent = None  # Dataset object to be processed, if applicable
    source_file = None  # path to dataset to be processed, if applicable

    description = "No description available"  # processor description, shown in web front-end
    category = "Other"  # processor category, for sorting in web front-end
    extension = "csv"  # extension of files created by this processor
    options = {}  # configurable options for this processor
    parameters = {}  # values for the processor's configurable options

    # Tumblr posts can overflow the csv module's default field size limit
    # (131072 characters), so double it.
    csv.field_size_limit(131072 * 2)

    def work(self):
        """
		Process a dataset

		Loads dataset metadata, sets up the scaffolding for performing some kind
		of processing on that dataset, and then processes it. Afterwards, clean
		up.
		"""
        try:
            self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
        except TypeError:
            # query has been deleted in the meantime. finish without error,
            # as deleting it will have been a conscious choice by a user
            self.job.finish()
            return

        if self.dataset.data.get("key_parent", None):
            # search workers never have parents (for now), so we don't need to
            # find out what the parent dataset is if it's a search worker
            try:
                self.parent = DataSet(key=self.dataset.data["key_parent"],
                                      db=self.db)
            except TypeError:
                # we need to know what the parent dataset was to properly handle the
                # analysis
                self.log.warning(
                    "Processor %s queued for orphan query %s: cannot run, cancelling job"
                    % (self.type, self.dataset.key))
                self.job.finish()
                return

            if not self.parent.is_finished():
                # not finished yet - retry after a while
                self.job.release(delay=30)
                return

            self.parent = DataSet(key=self.dataset.data["key_parent"],
                                  db=self.db)

            self.source_file = self.parent.get_results_path()
            if not self.source_file.exists():
                self.dataset.update_status("Finished, no input data found.")

        self.log.info("Running post-processor %s on query %s" %
                      (self.type, self.job.data["remote_id"]))

        self.parameters = self.dataset.parameters
        self.dataset.update_status("Processing data")
        self.dataset.update_version(get_software_version())

        if self.interrupted:
            return self.abort()

        if not self.dataset.is_finished():
            try:
                self.process()
                self.after_process()
            except WorkerInterruptedException:
                self.abort()
            except Exception as e:
                frames = traceback.extract_tb(e.__traceback__)
                frames = [
                    frame.filename.split("/").pop() + ":" + str(frame.lineno)
                    for frame in frames[1:]
                ]
                location = "->".join(frames)

                # Not all datasets have parent keys
                if len(self.dataset.get_genealogy()) > 1:
                    parent_key = " (via " + self.dataset.get_genealogy(
                    )[0].key + ")"
                else:
                    parent_key = ""

                raise ProcessorException(
                    "Processor %s raised %s while processing dataset %s%s in %s:\n   %s\n"
                    % (self.type, e.__class__.__name__, self.dataset.key,
                       parent_key, location, str(e)))
        else:
            # dataset already finished, job shouldn't be open anymore
            self.log.warning(
                "Job %s/%s was queued for a dataset already marked as finished, deleting..."
                % (self.job.data["jobtype"], self.job.data["remote_id"]))
            self.job.finish()

    def after_process(self):
        """
		After processing, declare job finished
		"""
        if self.dataset.data["num_rows"] > 0:
            self.dataset.update_status("Dataset saved.")

        if not self.dataset.is_finished():
            self.dataset.finish()

        # see if we have anything else lined up to run next
        for next in self.parameters.get("next", []):
            next_parameters = next.get("parameters", {})
            next_type = next.get("type", "")
            available_processors = self.dataset.get_available_processors()

            # run it only if the post-processor is actually available for this query
            if next_type in available_processors:
                next_analysis = DataSet(
                    parameters=next_parameters,
                    type=next_type,
                    db=self.db,
                    parent=self.dataset.key,
                    extension=available_processors[next_type]["extension"])
                self.queue.add_job(next_type, remote_id=next_analysis.key)

        # see if we need to register the result somewhere
        if "copy_to" in self.parameters:
            # copy the results to an arbitrary place that was passed
            if self.dataset.get_results_path().exists():
                shutil.copyfile(str(self.dataset.get_results_path()),
                                self.parameters["copy_to"])
            else:
                # if copy_to was passed, that means it's important that this
                # file exists somewhere, so we create it as an empty file
                with open(self.parameters["copy_to"], "w") as empty_file:
                    empty_file.write("")

        # see if this query chain is to be attached to another query
        # if so, the full genealogy of this query (minus the original dataset)
        # is attached to the given query - this is mostly useful for presets,
        # where a chain of processors can be marked as 'underlying' a preset
        if "attach_to" in self.parameters:
            try:
                # copy metadata and results to the surrogate
                surrogate = DataSet(key=self.parameters["attach_to"],
                                    db=self.db)

                if self.dataset.get_results_path().exists():
                    shutil.copyfile(str(self.dataset.get_results_path()),
                                    str(surrogate.get_results_path()))

                top_parent = self.dataset.get_genealogy()[1]
                top_parent.link_parent(surrogate.key)

                try:
                    surrogate.finish(self.dataset.data["num_rows"])
                except RuntimeError:
                    # already finished, could happen (though it shouldn't)
                    pass

                surrogate.update_status(self.dataset.get_status())

            except ValueError:
                # dataset with key to attach to doesn't exist...
                self.log.warning(
                    "Cannot attach dataset chain containing %s to %s (dataset does not exist)"
                    % (self.dataset.key, self.parameters["attach_to"]))

        self.job.finish()

    def abort(self):
        """
		Abort dataset creation and clean up so it may be attempted again later
		"""
        # remove any result files that have been created so far
        if self.dataset.get_results_path().exists():
            os.unlink(str(self.dataset.get_results_path()))

        if self.dataset.get_temporary_path().exists():
            shutil.rmtree(str(self.dataset.get_temporary_path()))

        # we release instead of finish, since interrupting is just that - the
        # job should resume at a later point. Delay resuming by 10 seconds to
        # give 4CAT the time to do whatever it wants (though usually this isn't
        # needed since restarting also stops the spawning of new workers)
        self.dataset.update_status(
            "Dataset processing interrupted. Retrying later.")

        if self.interrupted == self.INTERRUPT_RETRY:
            # retry later - wait at least 10 seconds to give the backend time to shut down
            self.job.release(delay=10)
        elif self.interrupted == self.INTERRUPT_CANCEL:
            # cancel job
            self.job.finish()

    def iterate_csv_items(self, path):
        """
		A generator that iterates through a CSV file

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		:param Path path:  Path to csv file to read
		:return:
		"""
        with open(path, encoding="utf-8") as input:
            reader = csv.DictReader(input)

            for item in reader:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Processor interrupted while iterating through CSV file"
                    )

                yield item

    def write_csv_items_and_finish(self, data):
        """
		Write data as csv to results file and finish dataset

		Determines result file path using dataset's path determination helper
		methods. After writing results, the dataset is marked finished. Will
		raise a ProcessorInterruptedException if the interrupted flag for this
		processor is set while iterating.

		:param data: A list or tuple of dictionaries, all with the same keys
		"""
        if not (isinstance(data, typing.List)
                or isinstance(data, typing.Tuple)) or isinstance(data, str):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        if not data:
            raise ValueError(
                "write_csv_items requires a list or tuple with at least one item")

        if not isinstance(data[0], dict):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        self.dataset.update_status("Writing results file")
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8",
                                                  newline='') as results:
            writer = csv.DictWriter(results, fieldnames=data[0].keys())
            writer.writeheader()

            for row in data:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results file")
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(data))

    def is_filter(self):
        """
		Is this processor a filter?

		Filters do not produce their own dataset but replace the parent dataset
		instead.

		:todo: Make this a bit more robust than sniffing the processor category
		:return bool:
		"""
        return hasattr(
            self, "category"
        ) and self.category and "filter" in self.category.lower()

    @abc.abstractmethod
    def process(self):
        """
		Process data

		To be defined by the child processor.
		"""
        pass
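Because BasicProcessor is abstract, a short sketch of a concrete processor may help show how the pieces fit together. Everything specific below (class name, type string, column names) is invented for illustration; only iterate_csv_items() and write_csv_items_and_finish() come from the class shown above.

class ExampleLengthProcessor(BasicProcessor):
    type = "example-length"  # hypothetical processor/job type
    category = "Other"
    description = "Adds a body-length column to the parent dataset (illustrative only)"
    extension = "csv"

    def process(self):
        rows = []
        for item in self.iterate_csv_items(self.source_file):
            # assumes the parent dataset has "id" and "body" columns
            rows.append({"id": item.get("id", ""),
                         "body_length": len(item.get("body", ""))})

        self.write_csv_items_and_finish(rows)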
Пример #50
0
from collections import defaultdict
import codecs

import csv
from nltk.tokenize import RegexpTokenizer
import numpy as np
import tensorflow as tf
import argparse

import model.data

seed = 42
np.random.seed(seed)
tf.set_random_seed(seed)

csv.field_size_limit(2**28)
tokenizer = RegexpTokenizer(r'\w+')
cachedStopWords = stopwords.words("english")


home_dir = os.getenv("HOME")


def loadGloveModel(gloveFile=None, hidden_size=None):
    if gloveFile is None:
        if hidden_size == 50:
            gloveFile = os.path.join(home_dir, "resources/pretrained_embeddings/glove.6B.50d.txt")
        elif hidden_size == 100:
            gloveFile = os.path.join(home_dir, "resources/pretrained_embeddings/glove.6B.100d.txt")
        elif hidden_size == 200:
            gloveFile = os.path.join(home_dir, "resources/pretrained_embeddings/glove.6B.200d.txt")
Пример #51
0
if __name__ == "__main__":
    csvSet = {
        "100": "./dataset/data_100.csv",
        "8000": "./dataset/data_8000.csv",
        "4000": "./dataset/data_4000.csv",
        "articles1": "./dataset/articles1.csv",
        "articles2": "./dataset/articles2.csv",
        "articles3": "./dataset/articles3.csv"
    }
    csvPath = csvSet[sys.argv[1]]
    connection = pika.BlockingConnection(
        pika.ConnectionParameters('localhost'))
    channel = connection.channel()
    # queue_name = ["queue_coba", "queue_1", "queue_2", "queue_3"]
    queue_name = sys.argv[2:]
    i = 0
    with open(csvPath) as csvFile:
        csv.field_size_limit(10000000)
        csvReader = csv.DictReader(csvFile)
        for row in csvReader:
            jsonData = {}
            _id = row["id"]
            content = unicode(row["content"], errors="ignore")
            jsonData["id"] = _id
            jsonData["content"] = content
            produce(queue_name[i], jsonData)
            i += 1
            if i > len(queue_name) - 1:
                i = 0
    pass
Пример #52
0
import csv
import ctypes
from os import cpu_count
from os.path import dirname, join
import sys

from tangentcft.TangentS.math_tan import latex_mml
from tangentcft.TangentS.math_tan.mathml import MathML

CSV_PARAMETERS = {
    'delimiter': '\t',
    'quotechar': '"',
    'quoting': csv.QUOTE_MINIMAL,
}
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

sys.setrecursionlimit(15000)

ARQMATH_INPUT_DATA_DIRNAME = '/mnt/storage/ARQMath_CLEF2020'
ARQMATH_OUTPUT_DATA_DIRNAME = 'output_data/ARQMath_CLEF2020'

ARQMATH_COLLECTION_INPUT_DATA_DIRNAME = '{}/Collection'.format(
    ARQMATH_INPUT_DATA_DIRNAME)
ARQMATH_COLLECTION_OUTPUT_DATA_DIRNAME = '{}/Collection'.format(
    ARQMATH_OUTPUT_DATA_DIRNAME)

ARQMATH_COLLECTION_QRELS_FILENAME = '{}/votes-qrels.V1.2.tsv'.format(
    ARQMATH_COLLECTION_OUTPUT_DATA_DIRNAME)

ARQMATH_COLLECTION_POSTS_LATEX_FILENAME = '{}/Posts.V1.2_latex.json.gz'.format(
Пример #53
0
  def __init__(self, streamID, write=False, fields=None, missingValues=None,
               bookmark=None, includeMS=True, firstRecord=None):
    """ Constructor
    
    streamID:
        CSV file name, input or output
    write:
        True or False, open for writing if True
    fields:
        a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors, only
        applicable when write==True
    missingValues:
        what missing values should be replaced with?
    bookmark:
        a reference to the previous reader, if passed in, the records will be
        returned starting from the point where bookmark was requested. Either 
        bookmark or firstRecord can be specified, not both. If bookmark is used, 
        then firstRecord MUST be None. 
    includeMS:
        If false, the microseconds portion is not included in the
        generated output file timestamp fields. This makes it compatible
        with reading in from Excel.
    firstRecord: 
        0-based index of the first record to start reading from. Either bookmark
        or firstRecord can be specified, not both. If bookmark is used, then
        firstRecord MUST be None. 

    Each field is a 3-tuple (name, type, special or '')

    The name is the name of the field. The type is one of: 'string', 'datetime',
    'int', 'float', 'bool' The special is either empty or one of S, R, T, C that
    designate their field as the sequenceId, reset, timestamp, or category.
    There can be at most one of each. There may be multiple fields of type
    datetime, but no more than one of them may be the timestamp field (T). The
    sequence id field must be either a string or an int. The reset field must be
    an int (and must contain 0 or 1). The category field must be an int.

    The FileRecordStream iterates over the field names, types and specials and
    stores the information.
    """
    
    # Call superclass constructor
    super(FileRecordStream, self).__init__()

    # Only bookmark or firstRecord can be specified, not both
    if bookmark is not None and firstRecord is not None:
      raise RuntimeError("Only bookmark or firstRecord can be specified, not both")

    if fields is None:
      fields = []
    if missingValues is None:
      missingValues = ['']
    
    # We'll be operating on csvs with arbitrarily long fields
    size = 2**27
    csv.field_size_limit(size)

    self._filename = streamID
    # We can't guarantee what system files are coming from, use universal
    # newlines
    self._write = write
    self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
    self._file = open(self._filename, self._mode)
    self._sequences = set()
    self.rewindAtEOF = False
    
    if write:
      assert fields is not None
      assert isinstance(fields, (tuple, list))
      # Verify all fields are 3-tuple
      assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                 for f in fields)
      names, types, specials = zip(*fields)
      self._writer = csv.writer(self._file)
    else:
      os.linesep = '\n' # make sure readline() works on windows too.
      # Read header lines
      self._reader = csv.reader(self._file, dialect='excel', quoting=csv.QUOTE_NONE)
      try:
        names = [n.strip() for n in self._reader.next()]
      except:
        raise Exception('The header line of the file %s contained a NULL byte' \
                        % self._filename)
      types = [t.strip() for t in self._reader.next()]
      specials = [s.strip() for s in self._reader.next()]

      # If there are no specials, this means there was a blank line
      if len(specials) == 0:
        specials=[""]

    if not(len(names) == len(types) == len(specials)):
      raise Exception('Invalid file format: different number of fields '
                      'in the header rows of file %s (%d, %d, %d)' %
                      (streamID, len(names), len(types), len(specials)))

    # Verify standard file format
    allowedTypes = ('string', 'datetime', 'int', 'float', 'bool')
    for i, t in enumerate(types):
      # This is a temporary hack for the Precog milestone, which passes in a
      # type 'address' for address fields. Here we simply map the type "address"
      # to "string".
      if t == 'address':
        types[i] = 'string'
        t = 'string'

      if t not in allowedTypes:
        raise Exception('Invalid file format for "%s" - field type "%s" '
                        'not one of %s ' % (self._filename, t, allowedTypes))

    for s in specials:
      if s not in ('', 'T', 'R', 'S', 'C', 'L'):
        raise Exception('Invalid file format. \'%s\' is not a valid special '
                        'flag' % s)

    self._fields = [FieldMetaInfo(*attrs)
                    for attrs in zip(names, types, specials)]
    self._fieldCount = len(self._fields)

    # Keep track on how many records have been read/written
    self._recordCount = 0

    self._timeStampIdx = specials.index('T') if 'T' in specials else None
    self._resetIdx = specials.index('R') if 'R' in specials else None
    self._sequenceIdIdx = specials.index('S') if 'S' in specials else None
    self._categoryIdx = specials.index('C') if 'C' in specials else None
    self._learningIdx = specials.index('L') if 'L' in specials else None

    # keep track of the current sequence
    self._currSequence = None
    self._currTime = None

    if self._timeStampIdx:
      assert types[self._timeStampIdx] == 'datetime'
    if self._sequenceIdIdx:
      assert types[self._sequenceIdIdx] in ('string', 'int')
    if self._resetIdx:
      assert types[self._resetIdx] == 'int'
    if self._categoryIdx:
      assert types[self._categoryIdx] == 'int'
    if self._learningIdx:
      assert types[self._learningIdx] == 'int'

    # Convert the types to the actual types in order to convert the strings
    if self._mode == self._FILE_READ_MODE:
      m = dict(int=intOrNone,
               float=floatOrNone,
               bool=parseBool,
               string=unescape,
               datetime=parseTimestamp)
    else:
      if includeMS:
        datetimeFunc = serializeTimestamp
      else:
        datetimeFunc = serializeTimestampNoMS
      m = dict(int=str,
               float=str,
               string=escape,
               bool=str,
               datetime=datetimeFunc)

    self._adapters = [m[t] for t in types]

    self._missingValues = missingValues

    #
    # If the bookmark is set, we need to skip over first N records
    #
    if bookmark is not None:
      rowsToSkip = self._getStartRow(bookmark)
    elif firstRecord is not None:
      rowsToSkip = firstRecord
    else:
      rowsToSkip = 0
      
    while rowsToSkip > 0:
      self.next()
      rowsToSkip -= 1


    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None
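The docstring above describes the expected layout only in words: three header rows (field names, field types, special flags) followed by data rows. A small, made-up illustration follows; the file name and field names are assumptions, and the exact timestamp string accepted depends on parseTimestamp().

# hotgym_sample.csv: three header rows, then data (names are illustrative)
with open("hotgym_sample.csv", "w") as f:
    f.write("timestamp,consumption,gym\n")     # row 1: field names
    f.write("datetime,float,string\n")         # row 2: field types
    f.write("T,,S\n")                          # row 3: specials (timestamp, none, sequence id)
    f.write("2010-07-02 00:00:00,21.2,gym1\n")

# The constructor above parses those three rows, validates the types and
# specials, and builds one adapter per field; bookmark/firstRecord skipping
# is done by calling self.next() as shown at the end of __init__.
stream = FileRecordStream("hotgym_sample.csv")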
Пример #54
0
import os
import io
import csv
import ast
import sys
import math
import struct
from enum import Enum
from exceptions import CSVError, SchemaError

csv.field_size_limit(sys.maxsize)  # Don't limit the size of user input fields.


class Type(Enum):
    UNKNOWN = 0
    BOOL = 1
    DOUBLE = 2
    FLOAT = 2  # alias to DOUBLE
    STRING = 3
    LONG = 4
    INT = 4  # alias to LONG
    INTEGER = 4  # alias to LONG
    ARRAY = 5
    ID = 6
    START_ID = 7
    END_ID = 8
    IGNORE = 9


def convert_schema_type(in_type):
    try:
Пример #55
0
#!/usr/bin/env python
import sys
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
import os
import argparse
import colored_traceback.always

# if you move this script, you'll need to change this method of getting the imports
partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')
sys.path.insert(1, partis_dir + '/python')

import utils
import glutils
from clusterpath import ClusterPath

helpstr = """
Script to extract sequences from a partis output file and write them to a fasta file.
For details of partis output files, see the manual.
Example usage:
    ./bin/extract-fasta.py --input-file partis-output.yaml --fasta-output-file out.fa  # extract all sequences from best partition in <partis-output.yaml>
"""
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    pass
formatter_class = MultiplyInheritedFormatter
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
parser.add_argument('--input-file', required=True, help='partis output file')
parser.add_argument('--fasta-output-file', required=True, help='output fasta file name')
parser.add_argument('--partition-index', type=int, help='if set, use the partition at this index in the cluster path, rather than the default of using the best partition')
parser.add_argument('--seed-unique-id', help='if set, take sequences only from the cluster containing this seed sequence, rather than the default of taking all sequences from all clusters')
parser.add_argument('--cluster-index', type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters')
Пример #56
0
class DSVParser(interface.FileObjectParser):
  """Delimiter separated values (DSV) parser interface."""

  # A list that contains the names of all the fields in the log file. This
  # needs to be defined by each DSV parser.
  COLUMNS = []

  # The default delimiter is a comma, but a tab, pipe or other character is
  # also known to be used. Note that the delimiter must be a byte string,
  # otherwise the csv module can raise a TypeError indicating that "delimiter"
  # must be a single character string.
  DELIMITER = b','

  # If there is a header before the lines start it can be defined here, and
  # the number of header lines that need to be skipped before the parsing
  # starts.
  NUMBER_OF_HEADER_LINES = 0

  # If there is a special quote character used inside the structured text
  # it can be defined here.
  QUOTE_CHAR = b'"'

  # The maximum size of a single field in the parser
  FIELD_SIZE_LIMIT = csv.field_size_limit()

  # Value that should not appear inside the file, made to test the actual
  # file to see if it conforms to standards.
  _MAGIC_TEST_STRING = b'RegnThvotturMeistarans'

  def __init__(self, encoding=None):
    """Initializes a delimiter separated values (DSV) parser.

    Args:
      encoding (Optional[str]): encoding used in the DSV file, where None
          indicates the codepage of the parser mediator should be used.
    """
    super(DSVParser, self).__init__()
    self._encoding = encoding
    if py2to3.PY_2:
      self._end_of_line = b'\n'
    else:
      self._end_of_line = '\n'
    self._maximum_line_length = (
        len(self._end_of_line) +
        len(self.COLUMNS) * (self.FIELD_SIZE_LIMIT + len(self.DELIMITER)))

  def _ConvertRowToUnicode(self, parser_mediator, row):
    """Converts all strings in a DSV row dict to Unicode.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      row (dict[str, bytes]): a row from a DSV file, where the dictionary
          key contains the column name and the value a binary string.

    Returns:
      dict[str, str]: a row from the DSV file, where the dictionary key
          contains the column name and the value a Unicode string.
    """
    for key, value in iter(row.items()):
      if isinstance(value, py2to3.UNICODE_TYPE):
        continue

      try:
        row[key] = value.decode(self._encoding)
      except UnicodeDecodeError:
        replaced_value = value.decode(self._encoding, errors='replace')
        parser_mediator.ProduceExtractionWarning(
            'error decoding DSV value: {0:s} as {1:s}, characters have been '
            'replaced in {2:s}'.format(key, self._encoding, replaced_value))
        row[key] = replaced_value

    return row

  def _CreateDictReader(self, line_reader):
    """Returns a reader that processes each row and yields dictionaries.

    csv.DictReader does this job well for single-character delimiters; parsers
    that need multi-character delimiters need to override this method.

    Args:
      line_reader (iter): yields lines from a file-like object.

    Returns:
      iter: a reader of dictionaries, as returned by csv.DictReader().
    """
    delimiter = self.DELIMITER
    quotechar = self.QUOTE_CHAR
    magic_test_string = self._MAGIC_TEST_STRING
    # Python 3 csv module requires arguments to constructor to be of type str.
    if py2to3.PY_3:
      delimiter = delimiter.decode(self._encoding)
      quotechar = quotechar.decode(self._encoding)
      magic_test_string = magic_test_string.decode(self._encoding)

    return csv.DictReader(
        line_reader, delimiter=delimiter, fieldnames=self.COLUMNS,
        quotechar=quotechar, restkey=magic_test_string,
        restval=magic_test_string)

  # pylint: disable=missing-return-type-doc
  def _CreateLineReader(self, file_object):
    """Creates an object that reads lines from a text file.

    The line reader is advanced to the beginning of the DSV content, skipping
    any header lines.

    Args:
      file_object (dfvfs.FileIO): file-like object.

    Returns:
      TextFile|BinaryLineReader: an object that implements an iterator
          over lines in a text file.

    Raises:
      UnicodeDecodeError: if the file cannot be read with the specified
          encoding.
    """
    # The Python 2 csv module reads bytes and the Python 3 csv module reads
    # Unicode strings.
    if py2to3.PY_3:
      line_reader = text_file.TextFile(
          file_object, encoding=self._encoding, end_of_line=self._end_of_line)

      # pylint: disable=protected-access
      maximum_read_buffer_size = line_reader._MAXIMUM_READ_BUFFER_SIZE

    else:
      line_reader = line_reader_file.BinaryLineReader(
          file_object, end_of_line=self._end_of_line)

      maximum_read_buffer_size = line_reader.MAXIMUM_READ_BUFFER_SIZE

    # Keep the line length one less than the maximum read buffer size so that
    # reading one extra character can still detect lines that are too long.
    if self._maximum_line_length > maximum_read_buffer_size:
      self._maximum_line_length = maximum_read_buffer_size - 1

    # If we specifically define a number of lines we should skip, do that here.
    for _ in range(0, self.NUMBER_OF_HEADER_LINES):
      line_reader.readline(self._maximum_line_length)
    return line_reader

  def _HasExpectedLineLength(self, file_object):
    """Determines if a file begins with lines of the expected length.

    As we know the maximum length of valid lines in the DSV file, the presence
    of lines longer than this indicates that the file will not be parsed
    successfully, without reading excessive data from a large file.

    Args:
      file_object (dfvfs.FileIO): file-like object.

    Returns:
      bool: True if the file has lines of the expected length.
    """
    original_file_position = file_object.tell()
    line_reader = self._CreateLineReader(file_object)
    for _ in range(0, 20):
      # Attempt to read a line that is longer than any line that should be in
      # the file.
      sample_line = line_reader.readline(self._maximum_line_length + 1)
      if len(sample_line) > self._maximum_line_length:
        file_object.seek(original_file_position)
        return False
    file_object.seek(original_file_position)
    return True

  @classmethod
  def GetFormatSpecification(cls):
    """Retrieves the format specification.

    Returns:
      FormatSpecification: format specification.
    """
    return specification.FormatSpecification(cls.NAME, text_format=True)

  def ParseFileObject(self, parser_mediator, file_object):
    """Parses a DSV text file-like object.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    # TODO: Replace this with detection of the file encoding via byte-order
    # marks. Also see: https://github.com/log2timeline/plaso/issues/1971
    if not self._encoding:
      self._encoding = parser_mediator.codepage

    try:
      if not self._HasExpectedLineLength(file_object):
        display_name = parser_mediator.GetDisplayName()
        raise errors.UnableToParseFile((
            '[{0:s}] Unable to parse DSV file: {1:s} with error: '
            'unexpected line length.').format(self.NAME, display_name))
    except UnicodeDecodeError as exception:
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile(
          '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
              self.NAME, display_name, exception))

    try:
      line_reader = self._CreateLineReader(file_object)
      reader = self._CreateDictReader(line_reader)
      row_offset = line_reader.tell()
      row = next(reader)
    except (StopIteration, csv.Error, UnicodeDecodeError) as exception:
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile(
          '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
              self.NAME, display_name, exception))

    number_of_columns = len(self.COLUMNS)
    number_of_records = len(row)

    if number_of_records != number_of_columns:
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile((
          '[{0:s}] Unable to parse DSV file: {1:s}. Wrong number of '
          'records (expected: {2:d}, got: {3:d})').format(
              self.NAME, display_name, number_of_columns,
              number_of_records))

    for key, value in row.items():
      if self._MAGIC_TEST_STRING in (key, value):
        display_name = parser_mediator.GetDisplayName()
        raise errors.UnableToParseFile((
            '[{0:s}] Unable to parse DSV file: {1:s}. Signature '
            'mismatch.').format(self.NAME, display_name))

    row = self._ConvertRowToUnicode(parser_mediator, row)

    if not self.VerifyRow(parser_mediator, row):
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile((
          '[{0:s}] Unable to parse DSV file: {1:s}. Verification '
          'failed.').format(self.NAME, display_name))

    self.ParseRow(parser_mediator, row_offset, row)
    row_offset = line_reader.tell()

    for row in reader:
      if parser_mediator.abort:
        break
      row = self._ConvertRowToUnicode(parser_mediator, row)
      self.ParseRow(parser_mediator, row_offset, row)
      row_offset = line_reader.tell()

  @abc.abstractmethod
  def ParseRow(self, parser_mediator, row_offset, row):
    """Parses a line of the log file and produces events.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      row_offset (int): offset of the row.
      row (dict[str, str]): fields of a single row, as specified in COLUMNS.
    """

  # pylint: disable=redundant-returns-doc
  @abc.abstractmethod
  def VerifyRow(self, parser_mediator, row):
    """Verifies if a line of the file is in the expected format.
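The class above is meant to be subclassed: a concrete parser defines COLUMNS (and optionally DELIMITER), then implements ParseRow() and VerifyRow(). A hypothetical subclass, with its name, columns and verification rule all invented for illustration, might look like this; only the attributes and method signatures come from the DSVParser code shown above.

class ExampleCommentsParser(DSVParser):
  """Hypothetical parser for a two-column, pipe-delimited log (illustration only)."""

  NAME = 'example_comments'
  COLUMNS = ['timestamp', 'message']
  DELIMITER = b'|'

  def ParseRow(self, parser_mediator, row_offset, row):
    # A real plaso parser would build event data here and hand it to the
    # mediator; that part is omitted in this sketch.
    pass

  def VerifyRow(self, parser_mediator, row):
    # Accept the file only if the first column looks like a numeric timestamp.
    return row['timestamp'].isdigit()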
Пример #57
0
from scipy import sparse
import os
import spacy
import torch
from torchtext import data, datasets
from torchtext.vocab import Vectors
from torch.nn import init
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext.vocab as vocab
import torch.optim as optim
import sys
import csv

csv.field_size_limit(20000001)
base_dir = "/content/drive/My Drive/Colab Notebooks/"
LABEL = data.LabelField()
#LENGTH = data.Field(use_vocab=False,dtype=torch.long)
creative_id_TEXT = data.Field(sequential=True,
                              lower=True,
                              include_lengths=True,
                              fix_length=100)
advertiser_id_TEXT = data.Field(sequential=True,
                                lower=True,
                                include_lengths=True,
                                fix_length=100)
ad_id_TEXT = data.Field(sequential=True,
                        lower=True,
                        include_lengths=True,
                        fix_length=100)
Пример #58
0
import collections
import sys
import csv
from collections import defaultdict
from collections import Counter

csv.field_size_limit(
    sys.maxsize
)  # Expand the per-field size limit (the csv default is 131072 characters per field); our data is over 233000.

###### take transitions data
columns = defaultdict(list)  # each value in each column is appended to a list
with open('user_transitions.csv', "r") as file:  #open data file
    reader = csv.reader(file, delimiter='|',
                        quotechar='"')  # read each row as a list of field values
    next(reader)  # skip header
    for row in reader:  # row is a list: [value1, value2, ...]
        #print(row)
        for i, v in enumerate(row):  # go over each column index and value
            columns[i].append(
                v
            )  # append the value to the list for column index i

tran_id = columns[0]
#print(tran_id[0])
tran_uuid = columns[1]
tran_from_url = columns[2]
tran_to_url = columns[3]
tran_cookie_id = columns[4]
tran_from_material_model_id = columns[5]
tran_to_material_model_id = columns[6]
Пример #59
0
import sys, csv, json, wppbatchlib

csv.field_size_limit(min(2147483647, sys.maxsize))

VERSION = '0.1'
AUTHOR = 'Trevor Anderson <*****@*****.**>'

iFilePath = None
resultsFilePath = None

if sys.argv == None or len(sys.argv) != 2 or len(
        sys.argv[1]) < 5 or sys.argv[1][-14:] != 'rawresults.csv':
    print 'Drop a CSV file containing raw JSON results onto this program to use it.'
    print '(you need to run SearchPhone.bat first)'
    var = raw_input("Hit enter to quit")
    quit()

iFilePath = sys.argv[1]
resultsFilePath = sys.argv[1][:-15] + '_results.csv'
print 'Extracting Phone Intelligence results from ' + str(iFilePath)

csvReader = csv.reader(open(iFilePath, 'rbU'), delimiter=',', quotechar='"')
csvWriter = csv.writer(open(resultsFilePath, 'wb'),
                       delimiter=',',
                       quotechar='"')

rowNum = 0
for row in csvReader:
    #each raw results row will contain the original input file row, followed by the API URL,
    #followed by the JSON response.
    rowNum += 1
Пример #60
0
""" A Fast, Offline Reverse Geocoder in Python

A Python library for offline reverse geocoding. It improves on an existing library
called reverse_geocode developed by Richard Penman.
"""
from __future__ import print_function

__author__ = 'Ajay Thampi'
import os
import sys
import csv
if sys.platform == 'win32':
    # On Windows the C long is 32 bits, so sys.maxsize is too large to pass in.
    # Use the largest signed 32-bit integer value as the maximum field size.
    csv.field_size_limit(2**31 - 1)
else:
    csv.field_size_limit(sys.maxsize)
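# A commonly seen alternative to the explicit platform check above (an
# illustrative sketch, not part of this library): start from sys.maxsize and
# halve until the csv module accepts the value, since field_size_limit()
# raises OverflowError for values that do not fit in a C long.
_candidate_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_candidate_limit)
        break
    except OverflowError:
        _candidate_limit //= 2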
import zipfile
from scipy.spatial import cKDTree as KDTree
from reverse_geocoder import cKDTree_MP as KDTree_MP
import numpy as np

GN_URL = 'http://download.geonames.org/export/dump/'
GN_CITIES1000 = 'cities1000'
GN_ADMIN1 = 'admin1CodesASCII.txt'
GN_ADMIN2 = 'admin2Codes.txt'

# Schema of the GeoNames Cities with Population > 1000
GN_COLUMNS = {
    'geoNameId': 0,
    'name': 1,