def genConferenceIdFeature(instances, paperList, maxConferenceId):
    '''
    Build a Feature with one single-entry line per instance, keyed by the
    conference id of the instance's paper.

    paperList rows are read positionally: column 0 is the paper id and
    column 3 the conference id (both passed through int()).  Each instance
    supplies its paper id at index 1; that value is used as-is for the
    lookup, so it has to compare equal to the int keys built from paperList.

    return Feature holding [[conferenceId + 1, 1.0]] for every instance, in
    instance order.  A paper id missing from paperList raises KeyError.
    '''
    sys.stderr.write("genConferenceIdFeature\n")

    # When a paper id occurs more than once, the last row wins.
    paperToConference = dict(
        (int(row[0]), int(row[3])) for row in paperList)

    # NOTE(review): the Feature is built from maxConferenceId unchanged while
    # the indices below are shifted by one -- confirm the caller allows for it.
    feature = Feature(maxConferenceId)
    for instance in instances:
        # The original carried a bare "# -1" note on this shift; presumably a
        # conference id of -1 means "none" and +1 keeps indices non-negative
        # -- TODO confirm.
        shiftedId = paperToConference[instance[1]] + 1
        feature.addLine([[shiftedId, 1.0]])
    feature.fix()
    return feature
def genJournalIdFeature(instances, paperList, maxJournalId):
    '''
    Build a Feature with one single-entry line per instance, keyed by the
    journal id of the instance's paper.

    paperList rows are read positionally: column 0 is the paper id and
    column 4 the journal id (both passed through int()).  Each instance
    supplies its paper id at index 1; that value is used as-is for the
    lookup, so it has to compare equal to the int keys built from paperList.

    return Feature holding [[journalId + 1, 1.0]] for every instance, in
    instance order.  A paper id missing from paperList raises KeyError.
    '''
    sys.stderr.write("genJournalIdFeature\n")

    # When a paper id occurs more than once, the last row wins.
    paperToJournal = dict(
        (int(row[0]), int(row[4])) for row in paperList)

    # NOTE(review): the Feature is built from maxJournalId unchanged while
    # the indices below are shifted by one -- confirm the caller allows for it.
    feature = Feature(maxJournalId)
    for instance in instances:
        # The original carried a bare "# -1" note on this shift; presumably a
        # journal id of -1 means "none" and +1 keeps indices non-negative
        # -- TODO confirm.
        shiftedId = paperToJournal[instance[1]] + 1
        feature.addLine([[shiftedId, 1.0]])
    feature.fix()
    return feature
def genCoauthorFeature(instances, pathFname, maxAuthorId):
    '''
    Build a Feature whose line for each instance lists the ids associated
    with that instance's paper in the CSV file at pathFname.

    The CSV's first row is skipped as a header.  Every following row
    contributes int(column 1) to the set stored under int(column 0).  For
    each instance, the set stored under instance[1] (its paper id) is
    emitted as [[id, 1.0], ...].

    return Feature(sparse, [features]) with one line per instance, in
    instance order.  An instance whose paper id never appears in column 0
    raises KeyError.
    '''
    sys.stderr.write("genCoauthorFeature\n")

    # NOTE(review): earlier revisions named column 0 "authorId" and column 1
    # "paperId", yet the table is looked up by the instance's paper id and its
    # members are used as indices into a Feature built from maxAuthorId.
    # Presumably the file is ordered (paperId, authorId) -- confirm against
    # the data.
    paperAuthorDict = {}
    # open() + with instead of file(): the handle is closed deterministically
    # and the code runs on both Python 2 and Python 3.
    with open(pathFname) as csvFile:
        csvReader = csv.reader(csvFile)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(csvReader, None)
        counter = Counter("paperAuthorDict")
        for row in csvReader:
            counter.inc()
            key, member = int(row[0]), int(row[1])
            paperAuthorDict.setdefault(key, set()).add(member)

    feature = Feature(maxAuthorId)
    counter = Counter("instance", 1000)
    for instance in instances:
        counter.inc()
        paperId = instance[1]
        # A real list (not a lazy map object) so addLine receives the same
        # kind of argument on Python 2 and Python 3.
        feature.addLine([[member, 1.0] for member in paperAuthorDict[paperId]])
    feature.fix()
    return feature