Example #1
	def loadGenderList(gender, country, dataPath, hasHeader):
		fd = open(os.path.join(dataPath, '%s%sUTF8.csv' % (country, gender)), 'rb')
		reader = UnicodeReader(fd)
		names = {}
		if hasHeader:
			unused_header = reader.next()
		# Load names as-is, but lower-cased
		for row in reader:
			name = row[0].lower()
			try:
				# The second column should be the count
				# (number of babies in some year with this name)
				count = int(row[1])
			except (IndexError, ValueError):
				# If the second column is missing, default to count=1
				count = 1
				if name in names:
					# Seen this name before, modulo case; count it only once
					# (there is no frequency information anyway)
					count = 0
			if name in names:
				names[name] += count
			else:
				names[name] = count
		fd.close()
		
		# Add versions without diacritics
		for name in names.keys():
			dname = unidecode(name)
			if dname not in names:
				names[dname] = names[name]

		return names
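A minimal usage sketch; the path and arguments below are hypothetical (the function expects files named like <country><gender>UTF8.csv under dataPath):

maleNames = loadGenderList('male', 'us', '/path/to/data', True)
femaleNames = loadGenderList('female', 'us', '/path/to/data', True)
# Counts are aggregated case-insensitively, with diacritic-free variants added
print len(maleNames), len(femaleNames)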
Example #2
def toDB(file_path, server_name, db_name, table_name, user, password, csv_delimiter):

	folder, fname = os.path.split(file_path)
	with Util.cd(folder):
		reader = UnicodeReader(open(fname, 'rb'))
		db = Database(db_name, user, server_name, 5432, password)
		csv_delimiter = csv_delimiter if csv_delimiter != "" else ";"
		fields, values = getInsertCommand(reader.next()[0].split(csv_delimiter), table_name)

		print 'running...'
		v = []
		while True:
			try:
				# Split data rows on the same delimiter as the header
				row = reader.next()[0].split(csv_delimiter)
			except Exception:
				break

			for i in range(len(row)):
				if row[i] == 'None':
					row[i] = ''

			v.append(tuple(row))
			
		args_str = ','.join(db.cur.mogrify(values, x) for x in tuple(v))
		db.cur.execute("INSERT INTO " + table_name + " " + fields + " VALUES " + args_str)
		# db.cur.executemany(insert_command, tuple(v))
		db.conn.commit()

		db.conn.close()
	
	print 'done!'
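The mogrify loop above batches all rows into a single multi-row INSERT, which is a common psycopg2 idiom. psycopg2 (2.7+) also provides psycopg2.extras.execute_values, which performs the same batching internally; a hedged sketch with hypothetical connection parameters and table names:

import psycopg2
import psycopg2.extras

conn = psycopg2.connect(dbname='mydb', user='me', password='secret',
                        host='localhost', port=5432)
cur = conn.cursor()
rows = [('1', 'alice'), ('2', 'bob')]  # hypothetical tuples parsed from the CSV
psycopg2.extras.execute_values(
    cur, 'INSERT INTO mytable (id, name) VALUES %s', rows)
conn.commit()
conn.close()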
Example #3
def load_acceptance_ratio():
  session = SessionFactory.get_session()
  print 'Loading acceptance ratios:'
  f = open(os.path.join(DATA_PATH, 'numSubmissions.csv'), "rb")
  reader = UnicodeReader(f)
  header = reader.next()
  subm = {}
  for row in reader:
      year = int(row[0])
      for idx, val in enumerate(row[1:]):
          conf = header[idx+1]
          try:
              count = int(val)
              if conf not in subm:
                  subm[conf] = {}
              subm[conf][year] = count
          except ValueError:
              # Skip cells that do not contain a count
              pass


  for acronym, name, impact in CONFERENCES:
      print acronym.upper()
      conference = session.query(Venue).\
              filter_by(acronym=acronym.upper()).\
              one()

      for year, count in subm[acronym.upper()].items():
          numSubm = SubmissionsCount(year, count)
          numSubm.venue = conference
          session.add(numSubm)
Example #4
    def __init__(self, f):
        reader = UnicodeReader(f)
        self.content = {}
        for i, row in enumerate(reader):
            # Store each row as a list, keyed by its 0-based line index
            self.content[i] = list(row)

        self.randomKeys = self.content.keys()
        random.shuffle(self.randomKeys)
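A usage sketch; the class name is an assumption (Example #6 constructs a RandomReader, and this __init__ matches that usage):

f = open('input.csv', 'rb')  # hypothetical input file
rr = RandomReader(f)
for key in rr.randomKeys:
    print rr.content[key]  # rows come back in shuffled order
f.close()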
Example #5
# Rewrite everything from line 362 (0-based) onwards; copy earlier lines verbatim
for idx, row in enumerate(f):
    nrow = rewrite(row) if idx >= 362 else row
    g.write(nrow)

f.close()
g.close()

# Read the data into a Python dictionary

f = open(os.path.join(dataPath, 'nameLists', 'nam_dict2.txt'), 'rb')
reader = UnicodeReader(f)

genderDict = {}

idx = 0

shortNames = []
for row in reader:
    if idx > 361:
        text = row[0]
        mf = text[:2].strip()  # M,1M,?M, F,1F,?F, ?, =
        #  =  <short_name> <long_name>
        name = text[2:29].lower().strip()
        sortingFlag = text[29]  # +,-; ignore +
        frequencies = text[30:-2]
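The slicing above follows nam_dict.txt's fixed-width layout; a hypothetical helper collecting the same field boundaries in one place:

def parse_nam_dict_line(text):
    # Field boundaries copied from the slicing above
    return {
        'gender_code': text[:2].strip(),    # M, 1M, ?M, F, 1F, ?F, ?, =
        'name': text[2:29].lower().strip(),
        'sorting_flag': text[29],           # + or -; + is ignored
        'frequencies': text[30:-2],
    }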
Example #6
def main():

    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded

    fr = open(data, 'rb')
    fw = open(results, 'ab')

    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)

    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()

    threads = []
    SOhashes = {}  # Maps gravatar hash -> SO user id

    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects

    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    data = [so_uid]
                    if _VISUAL_FEATURES:

                        # Download picture
                        filepath = os.path.join(picPath,
                                                '%d.jpg' % int(so_uid))
                        if not os.path.isfile(filepath):
                            queue.put(
                                ('http://www.gravatar.com/avatar/%s' % so_hash,
                                 filepath))
                            time.sleep(2)

                        # Load picture
                        pic = picUtils.loadPicture(filepath)

                        if _FACE:
                            if faceDetector.isFrontFace(
                                    pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))

                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))

                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))

                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')

                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))

                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))

                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' %
                                         so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" %
                                                  bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1

                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except Exception:
                        # Skip rows that cannot be encoded or written
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()

    # Downloads finished; send one poison pill per worker thread
    for i in xrange(10):
        queue.put((None, None))
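The (None, None) tuples act as poison pills: one per worker, so each Downloader thread can exit its loop. The Downloader class itself is not shown; a minimal sketch consistent with how main() uses it:

import threading
import urllib

class Downloader(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url, filepath = self.queue.get()
            if url is None:
                break  # poison pill received; stop this worker
            urllib.urlretrieve(url, filepath)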
Example #7
Session = sessionmaker(engine)
session = Session()


print 'Loading papers:'
for conferenceName in conferences:
    acronym = conferenceName
    print acronym.upper()
    
    # Create a new conference object
    conference = Conference(acronym.upper(), impact[acronym.upper()])
    session.add(conference)
    
    # Load the data into a csv reader
    f = open(os.path.join(dataPath, 'normalised-papers', '%s.csv' % conferenceName), 'rb')
    reader = UnicodeReader(f)
    
    for row in reader:
        # Deconstruct each row
        year = int(row[0])
        author_names = [a.strip() for a in row[1].split(',')]
        title = row[2]
        pages = row[3]
        try:
            num_pages = int(row[4])
        except (IndexError, ValueError):
            num_pages = 0
        session_h2 = unidecode(row[5]).strip()
        session_h3 = unidecode(row[6]).strip()
        try:
            selected = row[7]
Example #8
#	exit()

	# Choose dataset (the last assignment wins)
	#dataset = 'aliases2'
	#dataset = 'gnome'
	dataset = 'icsm'
	
	# Choose dataset size
	datasetsize = 'full'
	#datasetsize = 'sample'
	
	nameEmailData = MyDict()
	if dataset == 'aliases2':
		f = open(os.path.join(dataPath, "aliases2.csv"), "rb")
	#	f = open(os.path.join(dataPath, "testData2.csv"), "rb")
		reader = UnicodeReader(f)
		header = reader.next()
			
		for row in reader:
			try:
				idx = int(row[0])
				name = row[1]
				email = unspam(row[2])
				nameEmailData[idx] = (name, email)
			except:
				# Print malformed rows for inspection
				print row
		f.close()
		print 'Using the aliases2.csv data set...'
	elif dataset == 'gnome':
		import email.utils
		import email.header
Example #9
def load_pc():
  session = SessionFactory.get_session()
  # --- Update 8/12/2013 ---
  # Record also the role of PC members (PC Chair or General Chair)
  f = open(os.path.join(DATA_PATH, 'SE-conf-roles.csv'), 'rb')
  reader = UnicodeReader(f)
  header = reader.next()
  roles = {}

  def confName(conf):
      if conf == 'ESEC/FSE':
          return 'FSE'
      elif conf == 'CSMR-WCRE':
          return 'CSMR'
      else:
          return conf

  for row in reader:
      conf = confName(row[0].strip()).lower()
      year = int(row[1])
      name = '%s %s' % (row[2].strip(), row[3].strip())
      try:
          name = nameMap[name]
      except:
          pass
      role = row[5]
      if role == 'Organiser':
          role = 'PC member main track'

      if '?' not in name and role != 'Challenge Chair' and role != 'Data Chair':
          roles[(name, conf, year)] = role

  #Conference;Year;First Name;Last Name;Sex;Role
  #CSMR;2013;Anthony ;Cleve;Male;Program Chair
  #CSMR;2013;Filippo;Ricca;Male;Program Chair
  #CSMR;2013;Maura;Cerioli;Female;General Chair
  # -----------------------


  print 'Loading PC members:'
  for acronym, name, impact in CONFERENCES:
      print acronym.upper()

      # Get the conference object
      try:
          # I already have this PC conference in the database
          conference = session.query(Venue).\
                  filter_by(acronym=acronym).\
                  one()
      except:
          # New conference; add to database
          conference = Venue(acronym.upper(), impact, name, is_conference=True)
          session.add(conference)

      # Load the data into a csv reader
      f = open(os.path.join(DATA_PATH, 'normalised-pc', '%s.csv' % acronym.lower()), 'rb')
      reader = UnicodeReader(f)

      # --- Update 8/12/2013 ---
      withRole = set((name, year) for (name, conf, year) in roles.keys() if conf == acronym)
      # -----------------------

      for row in reader:
          # Deconstruct each row
          year = int(row[0])
          role = row[1]
          pcMemberName = row[2].strip()

          # --- Update 8/12/2013 ---
          if (pcMemberName, acronym, year) in roles:
              role = roles[(pcMemberName, acronym, year)]
              # discard() is a no-op if the pair is absent
              withRole.discard((pcMemberName, year))
          else:
              role = 'PC member main track'
          # -----------------------

          if pcMemberName:
              # Get the PC member object
              try:
                  # I already have this PC member in the database
                  pcMember = session.query(Person).\
                          filter_by(name=pcMemberName).\
                          one()
              except:
                  # New person; add to database
                  pcMember = Person(pcMemberName)
                  session.add(pcMember)

              try:
                  membership = session.query(PCMembership).\
                          filter_by(year=year).\
                          filter_by(role=role).\
                          filter_by(pcmember=pcMember).\
                          filter_by(venue=conference).\
                          one()
              except:
                  # New, add to database
                  membership = PCMembership(year, role)

                  membership.pcmember = pcMember
                  membership.venue = conference
                  session.add(membership)

      # --- Update 8/12/2013 ---
      # Report roles-file entries that never matched a PC list row
      print sorted(withRole)
      # -----------------------
  session.commit()
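The try/except blocks around session.query(...).one() implement a get-or-create pattern; a reusable sketch, assuming SQLAlchemy's NoResultFound (narrower than the bare excepts used above):

from sqlalchemy.orm.exc import NoResultFound

def get_or_create(session, model, **kwargs):
    # Return the single matching row, or stage a new instance for insert
    try:
        return session.query(model).filter_by(**kwargs).one()
    except NoResultFound:
        instance = model(**kwargs)
        session.add(instance)
        return instance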
Example #10


dataPath = os.path.abspath("../../../data")

#print normaliseName(u'Liz Burd')
#exit()

conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']

for conference in conferences:
    g = open(os.path.join(dataPath, 'normalised-pc', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)

    f1 = open(os.path.join(dataPath, 'pc', '%s.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)    
    for row in reader1:
        year = row[0]
        track = row[1]
        if track == 'main':
            pc = ','.join([normaliseName(name) for name in row[2].split(',') if len(normaliseName(name))])
            writer.writerow([year, track, pc])
    f1.close()
    g.close()

exit()

#conferences = ['ase']

allAuthorsSet = set()

for conference in conferences:
Example #11
import os
import sys
sys.path.append('..')
from unicodeMagic import UnicodeReader, UnicodeWriter
from dictUtils import MyDict
from nameMap import nameMap
from unidecode import unidecode

#conference = "icse"
conference = sys.argv[1]


# This is the list of DBLP author names (>1.1M people)
# 335078;M. G. J. van den Brand, Mark G. J. van den Brand, Mark van den Brand
f = open(os.path.abspath("../../../data/dblp-author-aliases-stripped.csv"), "rb")
reader1 = UnicodeReader(f)

# Read the list into a map
# reverseLookup['M. G. J. van den Brand'] 
# 	= reverseLookup['Mark G. J. van den Brand'] 
# 	= reverseLookup['Mark van den Brand']
# 	= 335078
reverseLookup = MyDict()
for row in reader1:
    aid = int(row[0])
    aliases = [name.strip() for name in row[1].split(',')]
    for name in aliases:
        reverseLookup[name] = aid
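With the map built, resolving any alias to its canonical DBLP id is a plain lookup; a small sketch (assuming MyDict supports dict-style get):

def canonical_id(name):
    # Returns None for names absent from the DBLP alias list
    return reverseLookup.get(name.strip())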

# Read names of conference PC members
# There are two cases: