def extract_unknown_ner(sentences_df,
                        TEXT_COL='sentences',
                        NER_COL='named_entities',
                        ner_port=9199):
    ''' Extract named entities using Stanford's NER.
        Requires the Stanford NER Java server to be running already.

        sentences_df: pandas dataframe with one column that contains non-lowercased sentences
        TEXT_COL: name of column with sentences
        NER_COL: str name of column for output
    '''
    # To run this, you need a local Stanford NER (SNER) server:
    # download Stanford NER (a zip named like stanford-ner-YYYY-MM-DD) from https://nlp.stanford.edu/software/CRF-NER.shtml#Download
    # and start the Java server, e.g.:
    # cd C:\ProgramData\Anaconda3\Lib\site-packages\sner\stanford-ner-2018-02-27
    # java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz  -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false

    # filter to sentences long enough to have sentiment and player name
    min_length = 10  # characters
    sentences_df = sentences_df[sentences_df[TEXT_COL].str.len() >= min_length].copy()

    # tag using Java
    pos_tagger = Ner(host='localhost', port=ner_port)
    # would love to parallelize this, as it takes ~2-3 hours per year of data
    # ddf = dd.from_pandas(sentences_df)
    sner_entities = lambda text: [
        token for token, part in pos_tagger.get_entities(text)
        if part in {'PERSON', 'ORGANIZATION', 'LOCATION'}
    ]
    sentences_df[NER_COL] = sentences_df[TEXT_COL].apply(sner_entities)

    return sentences_df
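
A minimal usage sketch (hedged): it assumes pandas and sner are installed, `from sner import Ner` is in scope for extract_unknown_ner, and the Java NER server from the comments above is listening on port 9199; the dataframe contents are made up for illustration.

import pandas as pd
from sner import Ner  # needed by extract_unknown_ner

df = pd.DataFrame({'sentences': [
    'Michael Jordan rejoined the Chicago Bulls in 1995 .',
    'Go home .',  # shorter than min_length, so it is filtered out
]})
tagged = extract_unknown_ner(df, TEXT_COL='sentences', NER_COL='named_entities', ner_port=9199)
print(tagged[['sentences', 'named_entities']])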
Example #2
 def __init__(self, debug=False):
     self.debug = debug
     self.time = time.time()
     pool = redis.ConnectionPool(host='localhost',port=6379, db=0)
     self.r=redis.Redis(connection_pool = pool)
     self.timecheck = 600
     self.locations = {}
     self.update_location_store()
     self.NER =  Ner(host="localhost", port=9199)
     self.counter = 0
     self.memory={}
     config = load_config("./config/assed_config.json")
     self.APIKEY = config["APIKEYS"]["googlemaps"]
     self.stream_tracker = {}
Example #3
 def __init__(self, assed_config, root_name, errorQueue, messageQueue,
              **kwargs):
     multiprocessing.Process.__init__(self)
     # set up DB connections
     self.DB_CONN = get_db_connection(assed_config)
     self.client = NewsApiClient(api_key="f715251d799140f793e63a1aec194920")
     self.root_name = root_name
     self.errorQueue = errorQueue
     self.messageQueue = messageQueue
     # No cached list because we are getting new stuff every day...
     self.config = kwargs["config"]
     self.NER = Ner(host='localhost', port=9199)
     pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
     self.r = redis.Redis(connection_pool=pool)
     pass
Example #4
 def start_sner(self):
     command = "java -Djava.ext.dirs=./lib -cp {0} edu.stanford.nlp.ie.NERServer " \
               "-port 9199 -loadClassifier {1}  -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer " \
               "-tokenizerOptions tokenizeNLs=false".format(self.sner_jar_path, self.sner_class_path)
     self.stop()
     self.proc_obj = Popen(args=command)
     self.tagger = Ner(host='localhost', port=9199)
class StanfordNERClient(NER):
    """
    To run the server see:
        pythonmodules/ner/run_stanford.sh
    """
    def __init__(self, host=None, port=None):
        self.host = 'localhost' if host is None else host
        self.port = 9001 if port is None else port
        self.server = None

    def connect(self):
        self.server = Ner(host=self.host, port=self.port)

    def tag(self, text, **kwargs):
        # remove "untokenizable" characters to avoid warning from ner server
        text = bytes(text, 'utf-8').decode('utf-8', 'ignore')
        text = text.replace('\xFF\xFD', '')

        text = str(text).splitlines()
        if self.server is None:
            self.connect()
        try:
            return self._run(text)
        except ConnectionResetError:
            self.connect()
        except ConnectionRefusedError:
            raise ConnectionRefusedError(
                "Connection refused, is the server running at %s:%d? Check run_stanford.sh..."
                % (self.host, self.port))
        return self._run(text)

    def _run(self, text):
        return list(
            itertools.chain(*[self.server.get_entities(line)
                              for line in text]))
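
A hedged usage sketch for StanfordNERClient (assumes the Stanford NER server started by run_stanford.sh is reachable at the given host/port; the sentence is illustrative):

# Hypothetical host/port values.
client = StanfordNERClient(host='localhost', port=9001)
tagged = client.tag("Barack Obama visited Berlin .\nAngela Merkel met him there .")
# tag() returns a flat list of (token, label) tuples across all lines
people = [token for token, label in tagged if label == 'PERSON']
print(people)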
Example #6
File: stf_ner.py Project: ycpan/my_model
class STF_NER:
    def __init__(self):
        self.tagger = Ner(host='localhost', port=8046)

    def get_ner(self, l):
        res = self.tagger.get_entities(l)
        return res
Example #7
def startNERServer():
    #os.chdir('./StanfordNER/')
    #subprocess.Popen(['./run_stanfordner_server.sh'])  # start the server up
    #os.chdir('../')
    # get the NER object
    ner_tagger = Ner(host='localhost', port=9199)

    return ner_tagger
Example #8
def use_stanford_ner(data):
    entities = []
    helper = Helper
    tagger = Ner(host='localhost', port=9199)

    for row in data:
        sentences = helper.get_sentences(row['supposed_string'])
        ne_sentences = [
            tagger.get_entities(sentence) for sentence in sentences
        ]
        tagged_sentences = [
            helper.transform_stanford_name_entity_to_tree(sentence)
            for sentence in ne_sentences
        ]
        grammar = helper.get_grammar('stanford_ner')
        parsed_sentences = helper.get_parsed_sentences(grammar,
                                                       tagged_sentences)
        entities.extend(helper.extract_entities(parsed_sentences, row))
    return entities
def com_ner(data_type: str, rp: str):
    """
    Gets the NER tags for each sentence using the pre-trained Stanford NER sequence model.

     :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation.
        all_ners: List of NER tags of each line
    """
    # initialize the tagger that talks to the Stanford NER server
    tagger = Ner(host="localhost", port=9199)
    all_ners = []
    read_flag, file = read_file("raw_sentence_{0}".format(data_type), rp)
    if read_flag:
        for line in file:
            word_tags = tagger.get_entities(line)
            ner_tags = [x[1] for x in word_tags]
            all_ners.append(" ".join(ner_tags))
        return True, all_ners
    else:
        return False, []  # keep the (flag, list) return shape described in the docstring
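
A hedged call sketch for com_ner ("/path/to/project" is a placeholder; read_file and the raw_sentence_* files come from the surrounding project):

ok, ners = com_ner("training", "/path/to/project")
if ok:
    print(ners[0])  # e.g. "PERSON PERSON O O LOCATION O"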
Example #10
def main(args):
    client = MongoClient(args.connection_string)
    db = client["dax_gcp"]
    collection = db["all_news"]

    tagger = Ner(host='localhost', port=9199)

    operations = []

    records = 0
    start_time = time.time()

    for doc in collection.find({}, no_cursor_timeout=True):

        tags_temp = get_tags(doc["NEWS_TITLE_NewsDim"], tagger)
        if not tags_temp:
            continue
        tags = process_tags(tags_temp)

        new_values = {}

        new_values["tag_header_LOCATION"] = list()
        new_values["tag_header_PERSON"] = list()
        new_values["tag_header_ORGANIZATION"] = list()
        new_values["tag_header_MONEY"] = list()
        new_values["tag_header_PERCENT"] = list()
        new_values["tag_header_DATE"] = list()
        new_values["tag_header_TIME"] = list()

        for word, tag in tags:
            if tag != "O":
                new_values["tag_header_" + tag].append(word)

        operations.append(UpdateOne({"_id": doc["_id"]}, {"$set": new_values}))

        # Send once every 1000 in batch
        if (len(operations) == 1000):
            print("Performing bulk write")
            collection.bulk_write(operations, ordered=False)
            operations = []
            records += 1000
            print("Write done. Saved {} records".format(records))

    if (len(operations) > 0):
        collection.bulk_write(operations, ordered=False)

    print("--- %s seconds ---" % (time.time() - start_time))
    print("Processed {} records".format(records))
Example #11
def getPersonNames(string):

    s = string.replace("\n", " . ")
    tagger = Ner(host='localhost', port=9199)
    taggedEng = tagger.get_entities(s)
    tagger = Ner(host='localhost', port=9198)
    taggedInd = tagger.get_entities(s)

    namesList = []
    name = None
    # group consecutive PERSON tokens from the English tagger into full names
    for word in taggedEng:
        if name != None and word[1] == "PERSON":
            name += " " + str(word[0])
        elif name == None and word[1] == "PERSON":
            name = str(word[0])
        elif name != None:
            namesList.append(name.lower().replace("\n", ""))
            name = None
    if name != None:
        namesList.append(name.lower().replace("\n", ""))

    name = None
    # repeat the grouping for the second tagger's output (port 9198)
    for word in taggedInd:
        if name != None and word[1] == "PERSON":
            name += " " + str(word[0])
        elif name == None and word[1] == "PERSON":
            name = str(word[0])
        elif name != None:
            namesList.append(name.lower().replace("\n", ""))
            name = None

    if name != None:
        namesList.append(name.lower().replace("\n", ""))

    # drop names that are substrings of longer names already collected
    n = len(namesList)
    i = 0
    while i < n:
        name = namesList[i]
        j = 0
        while j < n:
            if name in namesList[j] and i != j:
                namesList.pop(i)
                n -= 1
                i -= 1
                break
            j += 1
        i += 1

    return namesList
Example #12
def newsinfo(request):
    # nltk.download('vader_lexicon')
    context = {}
    form = StockForm(request.GET)
    form.is_valid()

    beta=0.8
    text = form.cleaned_data['news_input']
    text = re.sub(r"[^\w\s]", '', text) 

    st = Ner(host='localhost',port=9199)

    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(text)
    sentimentResult=""
    for k in sorted(ss):
        sentimentResult+='{0}: {1}, '.format(k, ss[k])
    sentimentResult=sentimentResult[:-2]
    context['sentimentresult']=sentimentResult
    # first part - find pronoun
    start=time.time()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    wordcounts=Counter(tokens)
    organizationList=[]
    for tag in tagged:
        if(tag[1]=='NNP' and tag[0]!='' and tag[0] not in organizationList):
            organizationList.append(tag[0])

    #second part - stanford NER
    newOrganizationList=st.get_entities(text)
    for org in newOrganizationList:
        if org[1]=='ORGANIZATION' and org[0] not in organizationList: 
            organizationList.append(org[0])

    #third part - nltk NER
    organizationList=get_continuous_chunks(text,organizationList)

    for org in organizationList:
        print(org)
    #use google search to check the term whether is it an organization/corporation
    newOrganizationList=[]    
    for org in organizationList:
        if(checkOrganization(org)):
            newOrganizationList.append(org)

    # print(newOrganizationList)
    tickerList=[]
    filteredOrgList=[]
    wordCountList=[]
    #find ticker
    for orgName in newOrganizationList:
        ticker,tickerOrgName=findTicker(orgName)
        if(ticker!=None and ticker not in tickerList):
            print(orgName)
            print(wordcounts[orgName])
            wordCountList.append(wordcounts[orgName])
            filteredOrgList.append(tickerOrgName)
            tickerList.append(ticker)
    probDict={}
    totalWordCount=sum(wordCountList)
    for idx,org in enumerate(filteredOrgList):
        competitorName=[]
        marketCap=[]
        percentage=1.0/totalWordCount*wordCountList[idx]
        if(org not in probDict):
            probDict[org]=percentage*beta
        else:
            probDict[org]+=percentage*beta
        getCompetitorInfo(tickerList[idx],org,competitorName,marketCap)
        if(competitorName!=[]):
            marketCap.pop(0)
            totalMarketCap=sum(marketCap)
            for i,competitor in enumerate(competitorName):
                if(i<len(marketCap)):
                    if(competitor not in probDict):
                        probDict[competitor]=percentage*(1-beta)*1.0/totalMarketCap*marketCap[i]
                    else:
                        probDict[competitor]+=percentage*(1-beta)*1.0/totalMarketCap*marketCap[i]

    print(probDict)
    labels=[]
    values=[]
    for key, value in probDict.iteritems():
        labels.append(key)
        values.append(value)
    if(len(labels)!=0):
        trace = graph_objs.Pie(labels=labels, values=values,textinfo='none')
        fig = Figure(data=Data([trace]))
        context['piechart']=plot(fig,auto_open=False,output_type='div')

    top10String=""
    sortList=numpy.argsort(values)[::-1]
    for idx,value in enumerate(sortList):
        if(idx==10):
            break
        top10String+=labels[value]+':'+str(values[value]*100)+'%,'

    top10String=top10String[:-1]
    context['top10']=top10String
    orgString=""
    for org in filteredOrgList:
        orgString+=org+','
    orgString=orgString[:-1]
    tickerString=""
    for ticker in tickerList:
        tickerString+=ticker+','
    tickerString=tickerString[:-1]
    context['foundentities']=orgString
    context['foundtickers']=tickerString
    context['result']=urllib2.unquote(text)
    print(context['result'])
    print(context['foundentities'])
    print(context['foundtickers'])
    return render(request, 'news/newsinfo.html', context)
Example #13
File: stf_ner.py Project: ycpan/my_model
 def __init__(self):
     self.tagger = Ner(host='localhost', port=8046)
Example #14
urls = ('/', 'SimpleIndexSearchPage', '/entityAwareSearchPage',
        'EntityAwareSearch', '/searchSimpleIndex', 'SearchSimpleIndex',
        '/searchEntityAwareIndex', 'SearchEntityAwareIndex')

CATEGORY = {
    'b': 'Business',
    'e': 'Entertainment',
    't': 'Science and Technology',
    'm': 'Health'
}
render = web.template.render('templates/', base='layout')
SOLR_SIMPLEINDEX = pysolr.Solr('http://localhost:8983/solr/simpleindex')
SOLR_ENTITYAWAREINDEX = pysolr.Solr(
    'http://localhost:8983/solr/entityawareindex')
STANFORD_NER_SERVER = Ner(host='localhost', port=9199)


def get_web_input(web_input):
    draw = web_input['draw']
    query = web_input['search[value]']
    if len(query) == 0:
        query = '*:*'
    offset = web_input['start']
    count = web_input['length']
    return draw, query, offset, count


def search_simple_index(query, offset, count, draw):
    """
        This function is responsible for hitting the solr endpoint
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from nltk import data as nltk_data
from sner import Ner
from stemming.porter2 import stem
from string import digits, punctuation
import unicodedata

_logger = logging.getLogger(__name__)

if config.NLTK_DATA_PATH is not None:
    nltk_data.path.append(config.NLTK_DATA_PATH)
_stanford_ner_tagger = Ner(host=config.STANFORD_NER_TAGGER_SERVER['host'],
                           port=config.STANFORD_NER_TAGGER_SERVER['port'])

_set_of_stopwords = set(nltk_stopwords.words("english"))
# the following is very useful, but shouldn't be done for the purpose of comparing to SOTA with a standard preprocessing
# for sw in ["BBC", "TheJournal", "ie", "Al", "Jazeera", "News"]:
#     set_of_stopwords.add(sw)


def ner_tokenize(text):
    stemmer = PorterStemmer()

    # unicode categories can be found here http://www.unicode.org/reports/tr44/#General_Category_Values
    chunked = ne_chunk(
        pos_tag(
            word_tokenize("".join(c
                                  for c in unicodedata.normalize('NFD', text)
INDEX_MAP = [
    "ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME",
    "TIMESTAMP"
]

# Location, Time, Person, Organization, Money, Percent, Date
CASELESS_CLASSIFIER = '/usr/share/stanford-ner/classifiers/english.muc.7class.caseless.distsim.crf.ser.gz'

# To use the Stanford NER server run the following command in the stanford-ner directory
'''
java -Xmx3g -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier
./classifiers/english.muc.7class.distsim.crf.ser.gz  -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer
-tokenizerOptions tokenizeNLs=false
'''
STANFORD_NER_HANDLER = Ner(host='localhost', port=9199)


def accumulate(list_of_tuples):
    tokens, entities = zip(*list_of_tuples)
    recognised = defaultdict(set)
    duplicates = defaultdict(list)

    for i, item in enumerate(entities):
        duplicates[item].append(i)

    for key, value in duplicates.items():
        # runs of consecutive token indices share a constant (position - index),
        # so each group below is one contiguous mention of this entity type
        for k, g in groupby(enumerate(value), lambda x: x[0] - x[1]):
            indices = list(map(itemgetter(1), g))
            recognised[key].add(' '.join(tokens[index] for index in indices))
    recognised.pop('O', None)
    return recognised  # assumed return value; the original snippet is truncated here
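
A hedged sketch of driving accumulate with the handler above (the sentence and the printed mapping are illustrative; get_entities returns (token, label) pairs):

# Requires the 7-class NER server from the comment block above.
pairs = STANFORD_NER_HANDLER.get_entities("Apple paid $ 5 million to Samsung in March 2014")
mentions = accumulate(pairs)
# e.g. {'ORGANIZATION': {'Apple', 'Samsung'}, 'MONEY': {'$ 5 million'}, 'DATE': {'March 2014'}}
print(dict(mentions))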
Example #17
class landslide_location_extractor(utils.AssedMessageProcessor.AssedMessageProcessor):
    def __init__(self, debug=False):
        self.debug = debug
        self.time = time.time()
        pool = redis.ConnectionPool(host='localhost',port=6379, db=0)
        self.r=redis.Redis(connection_pool = pool)
        self.timecheck = 600
        self.locations = {}
        self.update_location_store()
        self.NER =  Ner(host="localhost", port=9199)
        self.counter = 0
        self.memory={}
        config = load_config("./config/assed_config.json")
        self.APIKEY = config["APIKEYS"]["googlemaps"]
        self.stream_tracker = {}

    def process(self,message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["bad_location"] = 0
            self.stream_tracker[message["streamtype"]]["good_location"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        if time.time() - self.time > self.timecheck:
            utils.helper_utils.std_flush("[%s] -- Updating news location store."%utils.helper_utils.readable_time())
            self.update_location_store()
            self.time = time.time()
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush("[%s] -- Processed %i elements from %s with %i good locations and %i bad locations"%(utils.helper_utils.readable_time(), self.stream_tracker[_streamtype]["totalcounter"],_streamtype, self.stream_tracker[_streamtype]["good_location"], self.stream_tracker[_streamtype]["bad_location"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["good_location"] = 0
                self.stream_tracker[_streamtype]["bad_location"] = 0
        if self.debug:
            _streamtype = message["streamtype"]
            utils.helper_utils.std_flush("Processed %i elements from %s with %i good locations and %i bad locations"%(self.stream_tracker[_streamtype]["totalcounter"],_streamtype, self.stream_tracker[_streamtype]["good_location"], self.stream_tracker[_streamtype]["bad_location"]))

        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1
        # Check if location exists
        latitude = None
        longitude = None
        if "location" in message and message["location"] is not None and len(message["location"]) > 0:
            #already have a location
            pass
        else:
            # First location tagging to get locations...
            cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
            cleaned_message = " ".join(nltk.tokenize.word_tokenize(cleaned_message))
            loc_tags = self.NER.get_entities(cleaned_message)
            desc_locations = self.extractLocations(loc_tags)
            locations = " ".join(desc_locations) if len(desc_locations) > 0 else None

            if locations is None:
                # Attempt match...
                for sublocations in self.locations:
                    if sublocations in cleaned_message:
                        locations = sublocations
                        latitude = self.locations[sublocations][0]
                        longitude = self.locations[sublocations][1]
                        break
            else:
                # This is number of location items...
                pass

                #utils.helper_utils.std_flush(self.counter)
                        
            if locations is None:
                self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                return (False, message)

            # location is there, we will attempt geocoding right here... right now... right on this ship
            # With sublocations...
            if latitude is None or longitude is None:
                standardized_location = utils.helper_utils.location_standardize(locations)

                for sublocation in standardized_location.split(":"):
                    if sublocation in self.locations:
                        latitude = self.locations[sublocation][0]
                        longitude = self.locations[sublocation][1]
            
        
            message["location"] = locations
        
        # check if coords already in message
        if message["latitude"] is not None and message["longitude"] is not None:
            pass
        else:
            if latitude is not None and longitude is not None:
                message["latitude"] = str(latitude)
                message["longitude"] = str(longitude)
            else:
                # Attempt to get location from extractor memory (assed:extractor...)
                
                # First normalize...
                extractor_locations = utils.helper_utils.location_standardize(message["location"])
                # Then attempt retrieve
                coordinates = None
                for extractor_sublocation in extractor_locations.split(":"):
                    r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                    coordinates = self.r.get(r_key)
                    if coordinates is not None:
                        latlng = coordinates.decode("utf-8").split(",")
                        latitude = float(latlng[0])
                        longitude = float(latlng[1])
                        break
                
                if coordinates is None:
                    # no sublocation exists. We are gonna have to geocode
                    utils.helper_utils.std_flush("[%s] -- Performing geolocation for %s using googlemaps"%(utils.helper_utils.readable_time(), message["location"]))
                    latitude = False
                    while latitude == False:
                        latitude,longitude = utils.helper_utils.lookup_address_only(message["location"], self.APIKEY, self.r)
                        if latitude == False:
                            warnings.warn("[%s] -- WARNING -- Maps API Expired for %s. Trying after 2 hours."%(utils.helper_utils.readable_time(), time.time()))
                            time.sleep(7200)
                    if latitude is not None and longitude is not None:
                        coordinates = str(latitude) + "," + str(longitude)
                        for extractor_sublocation in extractor_locations.split(":"):
                            r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                            # TODO ADD TO MEMORY AS WELL
                            self.r.set(r_key, coordinates, ex=259200)
                    
            if latitude is not None and longitude is not None:
                message["latitude"] = str(latitude)
                message["longitude"] = str(longitude)
            else:
                self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                return (False, message)
        self.stream_tracker[message["streamtype"]]["good_location"] += 1
        return (True, message)
        


    def update_location_store(self,):
        self.locations = {}
        for _key in self.r.scan_iter(match="assed:sublocation:*", count=500):
            # keep only the first key location
            key_location = _key.decode("utf-8").split("assed:sublocation:")[1]
            if key_location.strip():
                key_coords = self.r.get(_key).decode("utf-8").split(",")
                latitude = float(key_coords[0])
                longitude = float(key_coords[1])
                self.locations[key_location] = (latitude, longitude)


    def extractLocations(self,temp_loc_tags):
        # group consecutive LOCATION-tagged tokens into multi-word location strings
        locations = []
        temp_loc=[]
        if temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc=[]
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations
Example #18
class News(multiprocessing.Process):
    def __init__(self, assed_config, root_name, errorQueue, messageQueue,
                 **kwargs):
        multiprocessing.Process.__init__(self)
        # set up DB connections
        self.DB_CONN = get_db_connection(assed_config)
        self.client = NewsApiClient(api_key="f715251d799140f793e63a1aec194920")
        self.root_name = root_name
        self.errorQueue = errorQueue
        self.messageQueue = messageQueue
        # No cached list because we are getting new stuff every day...
        self.config = kwargs["config"]
        self.NER = Ner(host='localhost', port=9199)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)
        pass

    def run(self, ):

        try:
            for event_topic in self.config["topic_names"]:
                if not self.config["topic_names"][event_topic][
                        "high_confidence"]["valid"]:
                    continue
                self.messageQueue.put("News downloader - working on %s" %
                                      event_topic)
                event_topic_key = str(
                    self.config["topic_names"][event_topic]["index"])
                self.cached_list = self.getCachedList(event_topic_key)
                stopwords = self.config["topic_names"][event_topic][
                    "stopwords"]
                keyword_set = self.config["topic_names"][event_topic][
                    "high_confidence"]["keywords"]
                articles = []
                for keyword in keyword_set:
                    try:
                        response = self.client.get_everything(q=keyword,
                                                              page_size=100)
                        articles += response["articles"]
                    except Exception as e:
                        self.messageQueue.put(
                            "NewsAPI for %s-%s failed with error: %s" %
                            (event_topic, keyword, repr(e)))

                article_content, article_location = self.getArticleDetails(
                    articles, stopwords)

                self.insertNews(article_content, event_topic_key)
                self.updateRedisLocations(article_location)

            self.DB_CONN.close()
            self.messageQueue.put(
                "Completed News download successfully at %s." %
                readable_time())

        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))

    def getArticleDetails(self, articles, stopwords):
        article_content = []
        article_location = []
        exist_skip, stop_skip, location_skip, coordinate_skip = 0, 0, 0, 0
        for article in articles:
            item = {}
            item["id"] = base64.b64encode(str.encode(article["url"])).decode()
            if item["id"] in self.cached_list:
                exist_skip += 1
                continue
            item["source"] = article["source"]["name"]
            item["url"] = article["url"]
            item["time"] = dateutil.parser.parse(
                article["publishedAt"]).replace(
                    tzinfo=tz.gettz('UTC')).astimezone(
                        tz=tz.gettz('EDT')).strftime("%Y-%m-%d %H:%M:%S")
            item["title"] = article["title"]
            item["text"] = article["description"]

            #We are doing an extremely basic lookup <-- if it has landslide keyword, we accept.
            #Since this is alreadya landslide feed, google/whatever has better classifiers. We exploit those to create a super simple keyword filter.
            search_flag = False
            search_counter = 0
            rText = item["text"]
            if "content" in article and article["content"] is not None and len(
                    article["content"]) > 0:
                rText += article["content"]
            while not search_flag and search_counter < len(stopwords):
                if stopwords[search_counter] in rText:
                    search_flag = True
                search_counter += 1
            if search_flag:
                stop_skip += 1
                continue

            # Description based location
            temp_loc_tags = self.NER.get_entities(item["text"])
            desc_locations = self.extractLocations(temp_loc_tags)
            content_locations = []
            try:
                temp_loc_tags = self.NER.get_entities(" ".join(
                    nltk.tokenize.word_tokenize(article["content"])))
                content_locations = self.extractLocations(temp_loc_tags)
            except (TypeError, IndexError):
                # TypeError -- if content is empty in article. IndexError -- if content is not None, but still empty
                pass

            # create location set - take unique from both desc and content_location, after normalization...
            item["description_location"] = [
                location_normalize(item) for item in desc_locations
            ]
            item["content_location"] = [
                location_normalize(item) for item in content_locations
            ]

            final_locations = list(
                set(item["description_location"] + item["content_location"]))
            if len(final_locations) == 0:
                location_skip += 1
                continue
            item["locations"] = final_locations

            lat, lng = lookup_address_only(
                desc_locations, self.config["APIKEYS"]["googlemaps"], self.r)
            if lat == False:
                raise ValueError("Ran out of GoogleMaps daily keys")
            if lat is None or lng is None:
                coordinate_skip += 1
                continue
            item["latitude"] = lat
            item["longitude"] = lng
            item["cell"] = generate_cell(lat, lng)

            article_content.append(item)
            article_location.append({
                "name": final_locations,
                "lat": lat,
                "lng": lng
            })

        self.messageQueue.put(
            "Obtained News with: %i items and skipped \n\texisting %i items\n\tstopword %i items, \n\tmissing location %i items \n\tmissing coordinates %i items"
            % (len(article_content), exist_skip, stop_skip, location_skip,
               coordinate_skip))
        return article_content, article_location

    def extractLocations(self, temp_loc_tags):
        locations = []
        temp_loc = []
        if temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations

    def convertDateFromTime(self, tm):
        '''
        Convert datetime to MySQL's datetime from time structure.
        '''
        return time.strftime("%Y-%m-%d %H:%M:%S", tm)

    def getCachedList(self, event_topic):
        cachedlist = set()
        cursor = self.DB_CONN.cursor()
        select = "SELECT item_id FROM HCS_News where timestamp > %s and topic_name = %s" % (
            (datetime.now() - timedelta(days=5)).strftime("%Y-%m-%d"),
            event_topic)
        cursor.execute(select)
        results = cursor.fetchall()
        cursor.close()
        for row in results:
            cachedlist.add(row[0])
        self.messageQueue.put("News cachedlist has  %i items in last 5 days" %
                              (len(cachedlist)))
        return cachedlist

    def insertNews(self, article_items, event_topic_key):
        event_topic_key = int(event_topic_key)
        cursor = self.DB_CONN.cursor()
        for item in article_items:

            insert = 'INSERT INTO HCS_News ( \
                        item_id, link, \
                        cell, latitude, longitude, timestamp, location, news_src, text, topic_name) \
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s, %s,%s)'
            params = (item['id'], item['url'], item['cell'], \
                    item['latitude'], item['longitude'], item['time'], ",".join(item["locations"]), item['source'], item['text'], event_topic_key)
            try:
                cursor.execute(insert, params)
                self.DB_CONN.commit()
            except Exception as e:
                traceback.print_exc()
                self.messageQueue.put('Failed to insert %s with error %s' %
                                      (item["id"], repr(e)))
        cursor.close()

    def updateRedisLocations(self, article_location):
        # get REDIS connection

        totalLocations = len(article_location)
        sublocations = 0
        for location in article_location:
            converted_location = " ".join(location["name"])
            location_std = location_standardize(converted_location)
            location_key = high_confidence_streamer_key("news:location:" +
                                                        location_std)
            self.r.set(location_key, converted_location, ex=259200)

            point_str = str(location["lat"]) + "," + str(location["lng"])
            for sublocation in location_std.split(":"):
                sublocationkey = sublocation_key(sublocation)
                self.r.set(sublocationkey, point_str, ex=259200)
                sublocations += 1
        self.messageQueue.put(
            "Completed News with: %i locations and %i sublocations" %
            (totalLocations, sublocations))
Example #19
import re
import requests
import os
import unicodedata
from bs4 import BeautifulSoup
from IPython.display import display
import mysql.connector, requests, os, os.path
from mysql.connector import Error, errorcode
from datetime import datetime
import sys
from sner import Ner
# os.environ['CLASSPATH'] = 'stanford-ner-4.0.0/stanford-ner.jar'
# from nltk.tag import StanfordNERTagger
# os.getenv('CLASSPATH') = '../Downloads/stanford-postagger.jar'
# nltk.download()
st = Ner(host='172.104.7.112', port=9199)
# print(st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) )
sys.setrecursionlimit(1000)


# rootdir = os.getcwd() + '/mnt'
# for subdir, dirs, files in os.walk(rootdir):
#     for file in files:
#         print(dirs,file)
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in the Unicode string s by the
        characters at the corresponding code points in Windows-1252,
        where possible.
    """
    def to_windows_1252(match):
 def connect(self):
     self.server = Ner(host=self.host, port=self.port)
Example #21
def get_stanford_ner_client():
    """
    Get an instance of the Ner http client.
    :return:
    """
    return Ner(host=STANFORD_NER_URL, port=STANFORD_NER_PORT)
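
A hedged one-liner using the helper (STANFORD_NER_URL and STANFORD_NER_PORT come from the surrounding module's settings):

tagger = get_stanford_ner_client()
print(tagger.get_entities("Angela Merkel visited Paris"))  # e.g. [('Angela', 'PERSON'), ('Merkel', 'PERSON'), ('visited', 'O'), ('Paris', 'LOCATION')]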
Example #22
from nltk.tag import StanfordNERTagger
from nltk.internals import find_jars_within_path
import labdatascript as lb
import re
import unicodedata
from sner import Ner
from nltk import tokenize

st = Ner(host='localhost', port=9199)  #7-class
# stanford_dir = st._stanford_jar.rpartition('/')[0] #these lines are just a hack to get around the java problem
# from nltk.internals import find_jars_within_path
# stanford_jars = find_jars_within_path(stanford_dir)
# st._stanford_jar = ':'.join(stanford_jars)


def analyze_texts(dict,
                  top_num=5
                  ):  #takes soups_dict of texts gathered, creates corpus
    '''
    Draws from labdatascript functions for NLP analysis of gathered texts

    Input: result dictionary from web crawler with gathered text as values, top n to slice from analysis results

    Output: dict with wordcounts, bi + tri + quad grams, topic model

    Further: takes top x values from ngrams, passed in as parameter in function call
    '''
    corpus = []
    for i in dict.values():
        i = re.sub('[0-9]+', '', i)
        corpus.append(i)
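
A hedged usage sketch based only on the docstring above (the crawler dict and the shape of the result are assumptions, since the function body is cut off here):

# Hypothetical crawler output: url -> gathered text.
soups_dict = {
    "https://example.com/a": "Alice Smith spoke at Stanford University in 2019 about speech processing.",
    "https://example.com/b": "The lab published three papers and released two corpora last year.",
}
results = analyze_texts(soups_dict, top_num=5)
# expected per the docstring: word counts, bi/tri/quad-grams and a topic model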
Example #23
# file from theof.py
cand_file = " "
# output
outputfile = " "
k = 0

with open(cand_file, "r") as input, open(outputfile, "w") as output:
    reader = csv.reader(input, delimiter="\t")
    writer = csv.writer(output, delimiter="\t", quoting=csv.QUOTE_ALL)
    for row in reader:
        articleid = row[0]
        source = row[1]
        sent = row[2]
        # using the server from stanford ner tagger
        tagger = Ner(host="localhost", port=9199)
        # tagging the candidate sentence
        tag = tagger.get_entities(sent)
        # check if all source words are tagged with "PERSON"
        for i in range(len(tag)):
            flag = False
            if tag[i][0] == first_pattern:
                source_person = []
                for j in range(1, min(6, len(tag) - i)):
                    if (tag[i + j][1] != "PERSON"
                            and tag[i + j][0] != second_pattern
                            and tag[i + j][0] != first_pattern):
                        break
                    elif tag[i + j][0] == first_pattern:
                        source_person = []
Example #24
import datetime
import re
import numpy as np
from nltk import word_tokenize
import pandas as pd

#for the named entity recoginition
from sner import Ner
tagger = Ner(host="localhost", port=9199)


#checks if the user's full name is included in the meeting title, if either username or title blank returns 0
def users_fullname(meeting):
    if pd.isnull(meeting["username"]) or pd.isnull(meeting["title"]):
        return 0
    if str(meeting["username"]).lower() in str(meeting["title"]).lower():
        return 1
    else:
        return 0
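
A hedged check of users_fullname (the meeting record is illustrative; any mapping with "username" and "title" keys works with pd.isnull):

meeting = {"username": "Jane Doe", "title": "Quarterly review with Jane Doe"}
print(users_fullname(meeting))  # 1 -- the full name appears in the title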


# returns 1 if the meeting occurs on a nonworkday, returns 0 if the starttime is blank
def not_workday(meeting):
    if pd.isnull(meeting["starttime"]):
        return 0
    if meeting["starttime"].isocalendar()[2] >= 6:
        return 1
    else:
        return 0
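
And a similar hedged check for not_workday (isocalendar()[2] is the ISO weekday, so 6 or 7 means a weekend day):

import datetime
meeting = {"starttime": datetime.datetime(2021, 6, 5, 14, 0)}  # a Saturday
print(not_workday(meeting))  # 1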