def extractDataFromLink(queue, urls, filename, fileCount):
    """Fetch the text behind ``urls``, store its sentences and extract triples.

    The (url, primaryEnt) pair is looked up in ``docCollection``. When no
    record exists, or the record holds no sentences, the content is fetched:
    URLs ending in ``.pdf`` are downloaded and passed through ``convert``;
    everything else goes through ``Extractor`` (ArticleExtractor) and
    ``cleanTheExtraction``. The text is split with ``tokenizer.tokenize`` and,
    if it yields more than ``minLen`` sentences, the UTF-8 encoded sentences
    are written to the collection and the joined text is handed to
    ``getTripleList``.

    Args:
        queue: not used by this function; kept for the caller's signature.
        urls: a single URL string (despite the plural name).
        filename: value stored and queried as ``primaryEnt``.
        fileCount: not used by this function; kept for the caller's signature.

    Returns:
        None. Errors raised while fetching, storing or extracting are printed
        to stdout and swallowed (best effort per URL).

    NOTE(review): another definition of ``extractDataFromLink`` appears later
    in this file; the later one wins at import time.
    """
    dbObj = mdb.mongodbDatabase('doc_collection')
    try:
        docs = dbObj.docCollection
        down_doc = docs.find_one({'url': urls, 'primaryEnt': filename})

        # Sentences for this url/entity are already stored: nothing to do.
        if down_doc is not None and down_doc['documents']:
            return

        try:
            if urls.endswith('.pdf'):
                print("############# found pdf #############")
                content = _downloadPdfText(urls)
                # Only encode text that is not already a byte string; calling
                # .encode() on non-ASCII bytes raises under Python 2.
                if isinstance(content, bytes):
                    cleanText = content
                else:
                    cleanText = content.encode('utf-8', 'ignore')
            else:
                extractor = Extractor(extractor='ArticleExtractor', url=urls)
                cleanText = cleanTheExtraction(extractor.getText())

            sentenceList = tokenizer.tokenize(cleanText)    # get sentences

            # Only keep extractions with more than minLen sentences.
            if len(sentenceList) > minLen:
                senList = [
                    s if isinstance(s, bytes) else s.encode('utf-8', 'ignore')
                    for s in sentenceList
                ]
                document = {'url': urls, 'documents': senList, 'primaryEnt': filename}
                if down_doc is None:
                    docs.insert_one(document)
                else:
                    # Third positional argument is passed through unchanged
                    # (presumably the upsert flag -- confirm against the driver).
                    docs.replace_one({'url': urls, 'primaryEnt': filename}, document, True)

                # corenlp -> sentcreate -> ollie
                getTripleList(' '.join(sentenceList), urls, filename)
        except Exception as e:
            print("error in boilerpipe code:  %s  url:  %s" % (e, urls))
            exc_type, _, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print((exc_type, fname, exc_tb.tb_lineno))
    finally:
        # Always release the Mongo connection, including on the early return.
        dbObj.client.close()


def _downloadPdfText(url):
    """Download the PDF at ``url`` through the proxy and return ``convert``'s output."""
    import tempfile  # local import: only the PDF path needs it

    # Installed as the global urllib2 opener, exactly as before, so any later
    # urllib2.urlopen call in this process also goes through the proxy.
    proxy_support = urllib2.ProxyHandler({"http": "proxy.iisc.ernet.in:3128"})
    urllib2.install_opener(urllib2.build_opener(proxy_support))

    # A unique temp file avoids clobbering between concurrent downloads and
    # stops a stray file accumulating in the working directory.
    pdf_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
    try:
        with pdf_file:
            response = urllib2.urlopen(url)
            try:
                pdf_file.write(response.read())
            finally:
                response.close()
        # The file is closed (and flushed) here, so convert() sees all bytes.
        return convert(pdf_file.name)
    finally:
        os.remove(pdf_file.name)
def extractDataFromLink(queue, urls, filename, fileCount):
    """Fetch or reuse the sentences for ``urls`` and run triple extraction.

    The (url, primaryEnt) pair is looked up in ``docCollection``:

    * No record, or a record without sentences: the page is fetched with
      ``Extractor`` (ArticleExtractor), cleaned with ``cleanTheExtraction``
      and split with ``tokenizer.tokenize``. If that yields more than
      ``minLen`` sentences they are stored (UTF-8 encoded) and the joined
      text is handed to ``getTripleList``.
    * Otherwise the stored sentences are joined and handed to
      ``getTripleList`` directly.

    Args:
        queue: not used by this function; kept for the caller's signature.
        urls: a single URL string (despite the plural name).
        filename: value stored and queried as ``primaryEnt``.
        fileCount: not used by this function; kept for the caller's signature.

    Returns:
        None. Errors in either branch are printed to stdout and swallowed
        (best effort per URL); the Mongo client is always closed.

    NOTE(review): an earlier definition of ``extractDataFromLink`` exists in
    this file; this later one replaces it at import time.
    NOTE(review): the previous per-sentence filter called ``ord()`` on whole
    sentences and removed items from the list while iterating it, so it
    dropped sentences more or less arbitrarily. It is replaced here by plain
    UTF-8 encoding of every sentence -- confirm that no filtering is wanted.
    """
    dbObj = mdb.mongodbDatabase('doc_collection')
    try:
        docs = dbObj.docCollection
        down_doc = docs.find_one({'url': urls, 'primaryEnt': filename})

        if down_doc is None or not down_doc['documents']:
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=urls)
                cleanText = cleanTheExtraction(extractor.getText())
                sentenceList = tokenizer.tokenize(cleanText)    # get sentences

                # Only keep extractions with more than minLen sentences.
                if len(sentenceList) > minLen:
                    # Encode only text that is not already a byte string;
                    # .encode() on non-ASCII bytes raises under Python 2.
                    storedSentences = [
                        s if isinstance(s, bytes) else s.encode('utf-8', 'ignore')
                        for s in sentenceList
                    ]
                    document = {
                        'url': urls,
                        'documents': storedSentences,
                        'primaryEnt': filename,
                    }
                    if down_doc is None:
                        docs.insert_one(document)
                    else:
                        # Third positional argument is passed through unchanged
                        # (presumably the upsert flag -- confirm against the driver).
                        docs.replace_one({'url': urls, 'primaryEnt': filename}, document, True)

                    # corenlp -> sentcreate -> ollie
                    getTripleList(' '.join(sentenceList), urls, filename)
            except Exception as e:
                # Best effort: report and move on instead of failing silently.
                print("error in boilerpipe code:  %s  url:  %s" % (e, urls))
        else:
            try:
                # Reuse the record fetched above rather than querying again.
                sentenceString = ' '.join(down_doc['documents'])
                # corenlp -> sentcreate -> ollie
                getTripleList(sentenceString, urls, filename)
            except Exception as e:
                print("error processing cached document:  %s  url:  %s" % (e, urls))
    finally:
        dbObj.client.close()