# -*- coding: utf-8 -*-
"""
Created on Tue Nov 12 16:06:46 2013

@author: elias
"""
import os
import collections


def find_and_print(keywords):
    """Return the ids of the documents whose NP files contain every keyword.

    Each file under ``NPs/`` holds one tab-separated record per line; a
    keyword matches when it equals one of the tab-separated fields of any
    line in the file.

    :param keywords: iterable of keyword strings that must ALL be present.
    :returns: set of document ids (file name up to its first dot).
    """
    # An empty query matches nothing -- this also guards the final
    # set.intersection(*[]) call, which would raise TypeError.
    if not keywords:
        return set()
    # Pre-seed every keyword with an empty set so that a keyword that is
    # never found forces an empty intersection.  (The original defaultdict
    # only created keys on a hit, silently dropping unseen keywords from
    # the intersection.)
    index = dict((keyword, set()) for keyword in keywords)
    for dirpath, dirnames, filenames in os.walk("NPs"):
        for filename in filenames:
            # Document id is the file name up to its first dot; partition
            # never raises, unlike filename.index(".").
            doc_id = filename.partition(".")[0]
            # Join with dirpath (not the fixed "NPs") so files found in
            # sub-directories of the walk open correctly.
            with open(os.path.join(dirpath, filename), 'r') as nps_file:
                for line in nps_file:
                    fields = line.strip().split("\t")
                    for keyword in keywords:
                        if keyword in fields:
                            index[keyword].add(doc_id)
    return set.intersection(*index.values())


if __name__ == "__main__":
    # Imported here (its only use) so the module can be imported without
    # the Mendeley fetcher being available.
    import medeley_fetch

    keywords = ["ocean acidification", "calcification"]
    ids = find_and_print(keywords)
    for doc_id in ids:
        print(medeley_fetch.get_abstract_for_id(doc_id))
        print("\n")
def parse_abstract(self, id):
    """Return ``{stemmed_np: [total_count, {original_np: count}]}`` for *id*.

    Reads the cached ``NPs/<id>.nps`` file when it exists; otherwise the
    abstract is fetched, parsed with CoreNLP, the noun phrases are counted
    and the cache file is written.  Blacklisted ids and fetch failures
    yield an empty dict.
    """
    count_dict = dict()
    if id in self.blacklist:
        print('Id {0} is blacklisted.'.format(id))
        return count_dict
    path = "NPs/" + id + ".nps"
    try:
        # Cache hit.  Line layout:
        #   stemmed \t total \t original_1 \t count_1 \t original_2 \t ...
        with open(path, 'r') as nps_file:
            for line in nps_file:
                split = line.strip().split("\t")
                stemmed_np = split[0]
                count = split[1]
                unstemmed = dict()
                for i in xrange(2, len(split), 2):
                    unstemmed[split[i]] = int(split[i + 1])
                count_dict[stemmed_np] = [int(count), unstemmed]
    except IOError:
        # Cache miss: fetch the abstract and build the counts from scratch.
        try:
            print("Parsing " + str(id))
            try:
                abstract = medeley_fetch.get_abstract_for_id(id)
            except Exception as e:
                if e.args[0] == "TooManyRequestsException":
                    print("Skipping due to server overload, consider halting program...")
                elif e.args[0] == "PaperHasNoAbstractException":
                    print("Object has no abstract, probably not a paper...")
                else:
                    print("Unknown exception occured when fetching paper...")
                # Can happen due to server overload, but apparently for
                # other reasons as well.
                return count_dict
            parse = self.corenlp.parse(abstract)
            document = json.loads(parse)
            with open("Parses/" + id + ".json", 'w') as parse_file:
                parse_file.write(parse)
            # Extract all the nps from the parse trees
            # TODO: Not that important, I guess
            for sentence in document['sentences']:
                nltk_tree = Tree(sentence['parsetree'])
                nps = self.treemanipulator.get_all_np_variations(nltk_tree)
                for original_np in nps:
                    if original_np == "":
                        continue
                    stemmed_np = self.stemmer.stem_string(original_np)
                    if stemmed_np in count_dict:  # O(1); was `in .keys()`
                        entry = count_dict[stemmed_np]
                        entry[0] += 1
                        # Use .get() so a plain-dict inner mapping (as
                        # loaded from the cache) cannot raise KeyError;
                        # `+= 1` only worked for defaultdict entries.
                        entry[1][original_np] = entry[1].get(original_np, 0) + 1
                    else:
                        count_dict[stemmed_np] = [1, defaultdict(int)]
                        count_dict[stemmed_np][1][original_np] = 1
            # Persist the counts in the same tab-separated layout we read.
            with open(path, 'w') as nps_file:
                for key in count_dict.iterkeys():
                    nps_file.write(str(key) + "\t" + str(count_dict[key][0]) + "\t")
                    for original_np in count_dict[key][1].iterkeys():
                        nps_file.write(str(original_np) + "\t" + str(count_dict[key][1][original_np]) + "\t")
                    nps_file.write("\n")
        except pexpect.ExceptionPexpect:
            # NOTE(review): this message was split across physical lines in
            # the mangled source; reconstructed as one statement.
            print("Timeout during parsing. Verify that the content is rubbish, and add to the blacklist...")
            exit()
    return count_dict