def extractDataFromLink(queue, urls, filename, fileCount):
    dbObj = mdb.mongodbDatabase('doc_collection')
    docs = dbObj.docCollection
    down_doc = docs.find_one({'url': urls, 'primaryEnt': filename})
    if(down_doc == None or (down_doc['documents'] == None) or len(down_doc['documents']) == 0):
        try:
            # print "download docs for ", urls
            cleanText = ''
            if(urls.endswith('.pdf')):
                print "############# found pdf #############"
                # fetch the PDF through the institute proxy
                proxy_support = urllib2.ProxyHandler({"http": "proxy.iisc.ernet.in:3128"})
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                # save the PDF to a scratch file (literally named 'filename'), then convert it to text
                with open('filename', 'wb') as f:
                    f.write(urllib2.urlopen(urls).read())
                content = convert('filename')
                cleanText = content.encode('utf-8', 'ignore')
            else:
                # HTML page: pull the article body out with boilerpipe
                extractor = Extractor(extractor='ArticleExtractor', url=urls)
                extracted_text = extractor.getText()
                cleanText = cleanTheExtraction(extracted_text)
            sentenceList = tokenizer.tokenize(cleanText)  # get sentences
            if(len(sentenceList) > minLen):  # keep the extraction only if it has more than the min no. of sentences
                curFile = filename + str(fileCount) + '.txt'
                senList = []
                for l in sentenceList:
                    newl = l.encode('utf-8', 'ignore')
                    senList.append(newl)
                document = {'url': urls, 'documents': senList, 'primaryEnt': filename}
                if down_doc == None:
                    post_id = docs.insert_one(document)  # .inserted_id
                else:
                    # upsert the refreshed sentence list for this url/entity pair
                    docs.replace_one({'url': urls, 'primaryEnt': filename}, document, True)
                sentenceString = ' '.join(sentenceList)
                getTripleList(sentenceString, urls, filename)  # call a function to do corenlp->sentcreate->ollie
        except Exception, e:
            print "error in boilerpipe code: ", e, " url: ", urls
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
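# --- Assumed module-level setup (not shown in this section) -----------------
# Both definitions of extractDataFromLink in this file rely on names that are
# presumably bound near the top of the module. A rough sketch of what that
# setup probably looks like; the wrapper module name and the minLen value are
# assumptions, not taken from the original source:
#
#     import os
#     import sys
#     import urllib2
#     import nltk.data
#     from boilerpipe.extract import Extractor
#     import mongodbConnection as mdb   # project wrapper around pymongo (actual module name unknown)
#
#     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # sentence splitter
#     minLen = 5                                                     # minimum sentences to keep an extraction
#
# cleanTheExtraction(), convert() (PDF -> text) and getTripleList() are other
# project helpers defined elsewhere in the codebase.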
# NOTE: a second definition of extractDataFromLink follows; when the module is
# loaded it replaces the binding of the version above. It has no PDF handling
# and silently swallows extraction errors.
def extractDataFromLink(queue, urls, filename, fileCount):
    dbObj = mdb.mongodbDatabase('doc_collection')
    docs = dbObj.docCollection
    down_doc = docs.find_one({'url': urls, 'primaryEnt': filename})
    if(down_doc == None or (down_doc['documents'] == None) or len(down_doc['documents']) == 0):
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=urls)
            extracted_text = extractor.getText()
            cleanText = cleanTheExtraction(extracted_text)
            sentenceList = tokenizer.tokenize(cleanText)  # get sentences
            if(len(sentenceList) > minLen):  # keep the extraction only if it has more than the min no. of sentences
                curFile = filename + str(fileCount) + '.txt'
                # p = file('/tmp/extractions/'+curFile, 'w')
                # drop sentences whose first character falls outside the '0'..'z' range;
                # iterate over a copy so removing items does not skip elements
                for s in sentenceList[:]:
                    try:
                        if(ord(s[0]) < 48 or ord(s[0]) > 122):
                            sentenceList.remove(s)
                        else:
                            print "@@@@@", s
                            # p.write(s)
                            # p.write(" ")
                    except:
                        sentenceList.remove(s)
                # p.close()
                document = {'url': urls, 'documents': sentenceList, 'primaryEnt': filename}
                if down_doc == None:
                    post_id = docs.insert_one(document)  # .inserted_id
                else:
                    docs.replace_one({'url': urls, 'primaryEnt': filename}, document, True)
                sentenceString = ' '.join(sentenceList)
                getTripleList(sentenceString, urls, filename)  # call a function to do corenlp->sentcreate->ollie
        except Exception, e:
            # print "whats the error ", e
            # print urls
            pass
    else:
        # the documents for this url/entity pair are already in MongoDB: reuse them
        try:
            curFile = filename + str(fileCount) + '.txt'
            oldVal = docs.find_one({'url': urls, 'primaryEnt': filename})
            sentenceList = oldVal['documents']
            sentenceString = ' '.join(sentenceList)
            getTripleList(sentenceString, urls, filename)  # call a function to do corenlp->sentcreate->ollie
        except:
            pass
    dbObj.client.close()
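# --- Hedged usage sketch (illustration only, not from the original source) --
# extractDataFromLink accepts a `queue` argument that its body never reads,
# which suggests it is meant to run as a worker, e.g. one process per search
# result URL. The driver below only illustrates that assumed calling pattern;
# processLinks, searchResultUrls and the 300-second timeout are hypothetical.
from multiprocessing import Process, Queue

def processLinks(searchResultUrls, primaryEnt):
    q = Queue()
    for i, url in enumerate(searchResultUrls):
        # one worker per link: download/extract, store in MongoDB, run the triple pipeline
        p = Process(target=extractDataFromLink, args=(q, url, primaryEnt, i))
        p.start()
        p.join(300)        # give each link up to five minutes
        if p.is_alive():   # terminate workers stuck on slow or dead links
            p.terminate()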