def load_citation_data(self):
    """Adds citation links to the paper objects.

    Reads ``acl.txt`` from every year directory returned by
    ``get_dir_names(self.releasePath)``.  Each non-blank line is split on
    ``==>`` into a source paper id (left) and a cited paper id (right).
    When the cited id is a key of ``self.paperInfo``, the source id is
    appended to that paper's ``self.paperFields.CITATION_LIST`` entry,
    creating the list on first use.  Cited ids that are not present in
    ``self.paperInfo`` are ignored.

    Raises:
        IOError: if ``acl.txt`` cannot be opened in a year directory.
    """
    print("Loading paper citations")

    # Keep only directories whose last path component is all digits (years).
    # A new list is built on purpose: removing entries from the list while
    # iterating over it skips the element that follows each removal.
    yearDirs = [path for path in get_dir_names(self.releasePath)
                if path.split("/")[-1].isdigit()]

    for yearDir in yearDirs:
        # ``with`` guarantees the file is closed even if a line fails to parse.
        with open(yearDir + "/acl.txt") as citationFileReader:
            for line in citationFileReader:
                # Re-encode ISO-8859-1 input as UTF-8, the same normalisation
                # load_paper_metadata applies, so ids compare equal.
                encodedLine = line.decode("iso-8859-1").encode("utf-8", 'ignore')
                cleanedLine = encodedLine.replace('\n', '').rstrip()
                if cleanedLine == '':
                    # if it is a blank line skip it
                    continue

                lineParts = cleanedLine.split("==>")
                if len(lineParts) < 2:
                    # No "==>" separator: there is no cited id to link.
                    continue

                sourcePaperId = lineParts[0].strip()
                citedPaperId = lineParts[1].strip()
                if citedPaperId in self.paperInfo:
                    paper = self.paperInfo[citedPaperId]
                    paper.setdefault(self.paperFields.CITATION_LIST,
                                     []).append(sourcePaperId)
def load_paper_metadata(self):
    """Creates initial paper objects with metadata from acl-metadata.txt.

    Reads ``acl-metadata.txt`` from every year directory returned by
    ``get_dir_names(self.releasePath)``.  The parser treats the file as
    blank-line separated records made of ``field = value`` lines:

    * A value is considered complete once it contains ``}``.  Until then,
      each following line is appended to the value, joined by one space.
    * Every completed field is handed to
      ``self.load_field_value(currentPaper, field, value)``.
    * A blank line (or the end of the file) finishes the current record,
      which is stored in ``self.paperInfo`` under the value found at
      ``currentPaper[self.paperFields.AAN_ID]``.

    Raises:
        IOError: if ``acl-metadata.txt`` cannot be opened.
        KeyError: if a finished record has no ``AAN_ID`` entry.
    """
    print("Loading paper metadata")

    # Keep only directories whose last path component is all digits (years).
    # A new list is built on purpose: removing entries from the list while
    # iterating over it skips the element that follows each removal.
    yearDirs = [path for path in get_dir_names(self.releasePath)
                if path.split("/")[-1].isdigit()]

    for yearDir in yearDirs:
        # All parser state is per file so that an unterminated record or
        # value at the end of one file cannot leak into the next one.
        currentPaper = {}
        isLineContinuation = False
        prevLine = ""
        prevValue = ""
        prevField = ""

        # ``with`` guarantees the file is closed even if parsing raises.
        with open(yearDir + "/acl-metadata.txt", 'r') as metaDataReader:
            for line in metaDataReader:
                encodedLine = line.decode("iso-8859-1").encode("utf-8", 'ignore')
                cleanedLine = encodedLine.replace("\n", '').rstrip()

                if isLineContinuation:
                    # continued line: extend the pending value
                    field = prevField
                    value = prevValue + " " + cleanedLine
                elif cleanedLine == '':
                    # A blank line closes the current record; consecutive
                    # blank lines (or leading ones) have nothing to close.
                    if prevLine != '':
                        paperId = currentPaper[self.paperFields.AAN_ID]
                        self.paperInfo[paperId] = currentPaper
                        currentPaper = {}
                    prevLine = cleanedLine
                    continue
                else:
                    # Split on the first "=" only, so values that themselves
                    # contain "=" are kept instead of being dropped.
                    lineParts = cleanedLine.split("=", 1)
                    if len(lineParts) != 2:
                        # Not a "field = value" line; ignore it.
                        prevLine = cleanedLine
                        continue
                    field = lineParts[0].strip()
                    value = lineParts[1].strip()

                if '}' in value:
                    self.load_field_value(currentPaper, field, value)
                    isLineContinuation = False
                else:
                    # Value not closed yet: remember it and keep reading.
                    prevField = field
                    prevValue = value
                    isLineContinuation = True
                prevLine = cleanedLine

        # End of file terminates the last record just like a blank line
        # would; without this a file lacking a trailing blank line loses it.
        if currentPaper:
            paperId = currentPaper[self.paperFields.AAN_ID]
            self.paperInfo[paperId] = currentPaper