예제 #1
0
 def load_citation_data(self):
     """Add incoming-citation lists to the papers in ``self.paperInfo``.

     Reads ``acl.txt`` from every directory returned by
     ``get_dir_names(self.releasePath)`` whose last path component consists
     only of digits.  Each non-blank line is split on ``==>``: the first
     part is the citing paper id, the second the cited paper id.  The citing
     id is appended to the cited paper's ``self.paperFields.CITATION_LIST``
     entry, which is created on first use.  Citations of papers that are not
     in ``self.paperInfo`` are ignored, and so are lines that contain no
     ``==>`` separator.

     This is Python 2 code: lines read from the file are byte strings and
     are transcoded from ISO-8859-1 to UTF-8 before being parsed.
     """
     print("Loading paper citations")
     # Build a filtered list instead of calling remove() during iteration:
     # removing while iterating skips the entry after every removed one, so
     # non-year directories could slip through.
     yearDirs = [path for path in get_dir_names(self.releasePath)
                 if path.split("/")[-1].isdigit()]
     citationField = self.paperFields.CITATION_LIST
     for yearDir in yearDirs:
         # The context manager closes the file even if a line raises.
         with open(yearDir + "/acl.txt") as citationFileReader:
             for line in citationFileReader:
                 # Parse the transcoded line (same transcoding as
                 # load_paper_metadata) rather than the raw bytes.
                 encodedLine = line.decode("iso-8859-1").encode("utf-8", 'ignore')
                 cleanedLine = encodedLine.rstrip()
                 if cleanedLine == '':
                     # blank lines carry no citation
                     continue
                 lineParts = cleanedLine.split("==>")
                 if len(lineParts) < 2:
                     # no separator: not a citation line, nothing to record
                     continue
                 sourcePaperId = lineParts[0].strip()
                 citedPaperId = lineParts[1].strip()
                 if citedPaperId in self.paperInfo:
                     paper = self.paperInfo[citedPaperId]
                     paper.setdefault(citationField, []).append(sourcePaperId)
예제 #2
0
 def load_paper_metadata(self):
     """Create the initial paper records from each year's acl-metadata.txt.

     Reads ``acl-metadata.txt`` from every directory returned by
     ``get_dir_names(self.releasePath)`` whose last path component consists
     only of digits.  The file is parsed as blank-line-separated records of
     ``field = value`` lines:

     * A line is split on its first ``=`` so that values containing ``=``
       stay intact.
     * A value is considered complete once it contains ``}``; until then the
       following lines are appended to it, separated by a single space.
     * Each complete field is handed to
       ``self.load_field_value(currentPaper, field, value)``.
     * A blank line, or the end of the file, stores the current record in
       ``self.paperInfo`` under its ``self.paperFields.AAN_ID`` value.
     * Lines without ``=`` that do not continue a pending value are skipped.
     * A blank line also discards an unterminated value, so that one
       malformed field cannot swallow the records that follow it.

     This is Python 2 code: lines read from the file are byte strings and
     are transcoded from ISO-8859-1 to UTF-8 before being parsed.

     Raises:
         KeyError: if a non-empty record has no ``AAN_ID`` entry when it is
             stored.
     """
     print("Loading paper metadata")
     # Build a filtered list instead of calling remove() during iteration:
     # removing while iterating skips the entry after every removed one, so
     # non-year directories could slip through.
     yearDirs = [path for path in get_dir_names(self.releasePath)
                 if path.split("/")[-1].isdigit()]
     idField = self.paperFields.AAN_ID

     for yearDir in yearDirs:
         currentPaper = {}
         pendingField = None   # field whose value has not been closed by '}'
         pendingValue = ""
         # The context manager closes the file even if a line raises.
         with open(yearDir + "/acl-metadata.txt", 'r') as metaDataReader:
             for line in metaDataReader:
                 decodedLine = line.decode("iso-8859-1")
                 cleanedLine = decodedLine.encode("utf-8", 'ignore').rstrip()

                 if cleanedLine == '':
                     # A blank line ends the record and drops any value that
                     # was never closed.
                     pendingField = None
                     pendingValue = ""
                     if currentPaper:
                         self.paperInfo[currentPaper[idField]] = currentPaper
                         currentPaper = {}
                     continue

                 if pendingField is not None:
                     # continuation of a multi-line value
                     field = pendingField
                     value = pendingValue + " " + cleanedLine
                 else:
                     field, separator, value = cleanedLine.partition("=")
                     if not separator:
                         # neither a field line nor a continuation
                         continue
                     field = field.strip()
                     value = value.strip()

                 if '}' in value:
                     self.load_field_value(currentPaper, field, value)
                     pendingField = None
                     pendingValue = ""
                 else:
                     pendingField = field
                     pendingValue = value

         # Store the last record when the file does not end with a blank
         # line, so it is neither lost nor merged into the next file's data.
         if currentPaper:
             self.paperInfo[currentPaper[idField]] = currentPaper