Example #1
0
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 12 16:06:46 2013

@author: elias
"""
import os, collections
import medeley_fetch

def find_and_print(keywords):
    """Return the set of document ids whose NP files contain ALL keywords.

    Walks the "NPs" directory tree; each file holds one tab-separated
    record per line.  A document matches a keyword when the keyword
    appears as one of the tab-separated fields on any line.

    :param keywords: iterable of keyword strings to search for
    :return: set of file basenames (text before the first "."); empty
             set when ``keywords`` is empty or any keyword matches
             nothing anywhere
    """
    if not keywords:
        # Guard: set.intersection(*[]) would raise TypeError.
        return set()

    index_dictionary = collections.defaultdict(set)

    for dirpath, dirnames, filenames in os.walk("NPs"):
        for filename in filenames:
            # Join against dirpath (not the hard-coded root) so files in
            # subdirectories of NPs open correctly.
            with open(os.path.join(dirpath, filename), 'r') as np_file:
                for line in np_file:
                    fields = line.strip().split("\t")
                    for keyword in keywords:
                        if keyword in fields:
                            # Index by basename up to the first dot.
                            index_dictionary[keyword].add(filename[:filename.index(".")])

    # Intersect over ALL requested keywords, not just the ones that were
    # found: a keyword with no matches must yield an empty result (the
    # defaultdict supplies an empty set for it), and this also avoids the
    # original TypeError when nothing matched at all.
    return set.intersection(*(index_dictionary[kw] for kw in keywords))

if __name__ == "__main__":
    # Documents must mention both phrases to be reported.
    keywords = ["ocean acidification", "calcification"]

    ids = find_and_print(keywords)

    for paper_id in ids:  # renamed from `id`, which shadows the builtin
        # Single-argument print() behaves identically under Python 2
        # (parenthesized expression) and Python 3 (function call).
        print(medeley_fetch.get_abstract_for_id(paper_id))
        print("\n")
Example #2
0
 def parse_abstract(self, id):
     """Return {stemmed_np: [total_count, {original_np: count}]} for *id*.

     Fast path reads the cached "NPs/<id>.nps" file.  On a cache miss
     (IOError) the abstract is fetched, parsed with CoreNLP, noun
     phrases are extracted and counted, and both the raw parse and the
     NP counts are written back to disk.  Returns an empty dict for
     blacklisted ids or fetch failures.
     """
     count_dict = dict()
     # Blacklisted ids are known-bad documents; skip them entirely.
     if id in self.blacklist:
         print 'Id {0} is blacklisted.'.format(id)
         return count_dict
     path = "NPs/" + id + ".nps"
     try:
         # Fast path: cached NP file.  Line format is tab-separated:
         # stemmed_np \t total_count \t (original_np \t count)* ...
         with open(path, 'r') as file:
             for line in file:
                 line = line.strip()
                 split = line.split("\t")
                 stemmed_np = split[0]
                 count = split[1]
                 unstemmed = dict()
                 # Remaining fields are (variant, count) pairs.
                 for i in xrange(2,len(split),2):
                     unstemmed[split[i]] = int(split[i+1])
                 count_dict[stemmed_np] = [int(count), unstemmed]
     except IOError:
         # Cache miss: fetch, parse, and build the counts from scratch.
         try:
             print "Parsing " + str(id)
             try:
                 abstract = medeley_fetch.get_abstract_for_id(id)
             except Exception as e:
                 # medeley_fetch reports failure modes through the first
                 # exception argument; every failure returns the (still
                 # empty) dict to the caller.
                 if e.args[0] == "TooManyRequestsException":
                     print "Skipping due to server overload, consider halting program..."
                 elif e.args[0] == "PaperHasNoAbstractException":
                     print "Object has no abstract, probably not a paper..."
                 else: 
                     print "Unknown exception occured when fetching paper..."
                 return count_dict
             # Can happen due to server overload, but apparently for other reasons as well
             parse = self.corenlp.parse(abstract)
             document = json.loads(parse)
             # Persist the raw CoreNLP JSON so parsing can be reused.
             with open("Parses/" + id + ".json", 'w') as file:
                 file.write(parse)
             # Extract all the nps from the parse trees
             # TODO: Not that important, I guess
             for sentence in document['sentences']:
                 parse_tree = sentence['parsetree']
                 nltk_tree = Tree(parse_tree)
                     
                 nps = self.treemanipulator.get_all_np_variations(nltk_tree)
                 for original_np in nps:      
                     if original_np != "":
                         # Aggregate under the stemmed form; per-variant
                         # counts of the original surface forms are kept
                         # in the nested defaultdict.
                         stemmed_np = self.stemmer.stem_string(original_np)
                         if stemmed_np in count_dict.keys():
                             count_dict[stemmed_np][0] += 1
                             count_dict[stemmed_np][1][original_np] += 1
                         else:
                             count_dict[stemmed_np] = [1, defaultdict(int)]
                             count_dict[stemmed_np][1][original_np] = 1
             # Write the counts back in the same tab-separated format the
             # fast path above reads.
             with open(path, 'w') as file:
                 for key in count_dict.iterkeys():
                     file.write(str(key) + "\t" + str(count_dict[key][0]) + "\t")
                     for original_np in count_dict[key][1].iterkeys():
                         file.write(str(original_np) + "\t" + str(count_dict[key][1][original_np]) + "\t")
                     file.write("\n")
         except pexpect.ExceptionPexpect:
             # CoreNLP timed out -- presumably rubbish input; operator is
             # expected to verify and add the id to the blacklist.
             print "Timeout during parsing. Verify that the content is rubbish, and add to the blacklist..."
             exit()
     return count_dict