def treeify_tweet(tweet):
    """Split a tweet's dependency parse into root-anchored segments.

    Args:
        tweet: mapping with 'tokens', 'chunk', 'token_tags' and 'parser'
            entries (schema assumed from usage -- confirm with callers).

    Returns:
        list of (tokens, pos_tags, root_token) triples, one per dependency
        root whose stopword-filtered subtree still contains a verb ('V'),
        common noun ('N') or proper noun ('^') tag.
    """
    tree = ParseTree(tweet['tokens'], tweet['chunk'], tweet['token_tags'],
                     tweet['parser'])
    rst = []

    # (root, token) pair per position; kept for parity with the original
    # implementation even though nothing below reads it.
    segment = [(tree.find_root(i), tree.tokens[i])
               for i in range(tree.get_size())]

    # The stop list lives on a singleton -- fetch it once instead of once
    # per descendant per comprehension (the original did 2 lookups/token).
    stop_list = nlp.getInstance().stop_list

    for root in tree.find_all_root():
        # Subtree of this root, root included. Use concatenation rather
        # than += so we never mutate the list find_descendants returned.
        subtree = tree.find_descendants(root) + [root]

        # Build both lists in ONE pass so they stay index-aligned.
        tokens, pos_tags = [], []
        for i in subtree:
            if tree.tokens[i].lower() not in stop_list:
                tokens.append(tree.tokens[i])
                pos_tags.append(tree.pos_tags[i])

        # Keep only segments that still carry a content word
        # (CMU ARK tweet tagset: V=verb, N=noun, ^=proper noun).
        if {'V', 'N', '^'} & set(pos_tags):
            rst.append((tokens, pos_tags, tree.tokens[root]))

    return rst
    def __init__(self, query, graph):
        """Set up the tweet graph, its table and the NLP helper for *query*.

        A fresh Graph/TweetTable pair is created when *graph* is None;
        otherwise the caller's graph is wrapped by a TweetTable.
        """
        #         self.start_time
        #         self.end_time

        if graph is not None:
            self.g = graph
            self.table = TweetTable(graph=graph)
        else:
            self.g = Graph()
            self.table = TweetTable()

        self.nodeQueue = list()

        # Best-effort NLP initialisation: on TypeError the failure is only
        # printed and self.nlp stays unset -- downstream code must tolerate it.
        try:
            self.nlp = nlp.getInstance(query)
        except TypeError:
            print('failed')
            print(query)
            print(traceback.format_exc())

        self.logging = False

        # min edge weight to keep in graph when clustering
        self.min_edge_weight = 3
        # min count of edges in graph
        self.min_edges = 3
        # change clustering mode
        self.clusteringMode = 'community'
# === Example #3 (示例#3), score 0 — snippet-site separator from the scrape; kept as a comment so the file stays parseable ===
    def __init__(self, query, graph):
        '''
        Constructor

        Builds the tweet graph (new, or wrapping the one supplied) plus its
        TweetTable, fetches the NLP singleton for *query* (best effort), and
        sets the default clustering parameters.
        '''
        #         self.start_time
        #         self.end_time

        if graph is None:
            # No graph supplied: start from an empty graph/table pair.
            self.g = Graph()
            self.table = TweetTable()
        else:
            # Wrap the caller's graph; the table indexes into it.
            self.g = graph
            self.table = TweetTable(graph=graph)

        self.nodeQueue = list()

        try:
            self.nlp = nlp.getInstance(query)
        except TypeError:
            # NOTE(review): the failure is only printed and self.nlp stays
            # unset, so later attribute access would raise AttributeError --
            # confirm callers tolerate this.
            print('failed')
            print(query)
            print(traceback.format_exc())

        self.logging = False

        # min edge weight to keep in graph when clustering
        self.min_edge_weight = 3

        #min count of edges in graph
        self.min_edges = 3

        #change clustering mode
        self.clusteringMode = 'community'
    def retrieve_phrases(self, toString=False):
        """Collect the content phrase under each dependency root.

        For every root, its subtree indices (root included) are sorted back
        into surface order, then filtered against the NLP stop list and the
        `preserve_pos` POS-tag whitelist; surviving tokens form one phrase.

        Args:
            toString: when True, return a single string with a phrase's
                tokens joined by '_' and phrases joined by ' '; otherwise
                return a list of token lists.
        """
        # The stop list lives on a singleton -- fetch it once, not per token.
        stop_list = nlp.getInstance().stop_list

        phrases = []

        for root in self.find_all_root():
            # Concatenate instead of += so the list returned by
            # find_descendants is never mutated in place.
            indices = sorted(self.find_descendants(root) + [root])

            # The original applied the stop-list filter and then the POS
            # filter in two passes; combined in one pass the result is
            # identical. (The original also built an unused `tags` list,
            # dropped here.)
            tokens = [
                self.tokens[i] for i in indices
                if self.tokens[i].lower() not in stop_list
                and self.pos_tags[i] in preserve_pos
            ]

            if tokens:
                phrases.append(tokens)

        if not toString:
            return phrases
        return " ".join("_".join(phrase) for phrase in phrases)
    def __init__(self, token, tag, index):
        # Store a validated token entry; keys are assigned via self[...],
        # so the enclosing class is presumably a dict subclass -- confirm.

        # Normalise/validate the raw token for this POS tag.
        token = nlp.getInstance().valid_token(token, tag)

        if token is None:
            # NOTE(review): returning from __init__ does NOT prevent object
            # creation -- the instance simply ends up without these keys.
            # Confirm callers check for that (or move the guard to __new__).
            return None

        self['token'] = token
        self['tag'] = tag
        self['index'] = index
    def __init__(self, token, tag, index):
        """Store the validated token, its POS tag and its position.

        Entries are written as mapping keys (the enclosing class behaves
        like a dict); nothing is stored when validation rejects the token.
        """
        cleaned = nlp.getInstance().valid_token(token, tag)

        if cleaned is None:
            return None

        self['token'] = cleaned
        self['tag'] = tag
        self['index'] = index
# === Example #7 (示例#7), score 0 — snippet-site separator from the scrape; kept as a comment so the file stays parseable ===
from idlelib.IOBinding import encoding
from Search import SolrSearcher
from Search import TimeFunc
from itertools import groupby
import json
import re
from Basic.AhocSearch import AhocSearch
from NLP.TermFold import TermFold
from NLP import NLPManager
import pandas
from test.test_buffer import _ca

# Load the lowercased English stop-word list once at import time.
# A context manager closes the file handle deterministically; the original
# left the handle open for the garbage collector to reclaim.
with open("Data/stopwords_en.txt", encoding="utf-8") as _stopwords_file:
    stopwords = _stopwords_file.read().lower().splitlines()
print(stopwords)  # kept: original module-level debug side effect

nlp = NLPManager.getInstance()

class EMTerms(object):
    """Configuration holder for the EM-terms corpus and its lookup state."""

    # Location of the (light) EM terms corpus.
    corpus_dir = "Data/EMTerms-light.csv"
    # Minimum term frequency.
    tf_threshold = 1
    # 23 categories, so tp_threshold does not affect.
    tp_threshold = 30
    # Keyword mode; mode 2 needs POS tagging for each input tweet.
    keyword_mode = 1

    class __EMTerms:
        """Inner payload: per-term and per-code lookup tables."""

        def __init__(self):
            # Both databases start empty and are populated elsewhere.
            self.term_db = {}
            self.code_db = {}
def treeify_tweet(tweet):
    """Break a tweet's dependency parse into root-anchored segments.

    Returns (tokens, pos_tags, root_token) triples for every dependency
    root whose stopword-filtered subtree contains a 'V', 'N' or '^' tag.
    """
    tree = ParseTree(tweet['tokens'], tweet['chunk'], tweet['token_tags'], tweet['parser'])
    results = []

    # (root, token) pair per position; computed as in the original although
    # nothing below reads it.
    segment = []
    for position in range(tree.get_size()):
        segment.append((tree.find_root(position), tree.tokens[position]))

    for root in tree.find_all_root():
        subtree = tree.find_descendants(root)
        subtree += [root]

        # Indices surviving the stop-list filter, then the aligned
        # token / POS-tag projections of that survivor set.
        kept = [i for i in subtree
                if tree.tokens[i].lower() not in nlp.getInstance().stop_list]
        tokens = [tree.tokens[i] for i in kept]
        pos_tags = [tree.pos_tags[i] for i in kept]

        has_content_word = ('V' in pos_tags) or ('N' in pos_tags) or ('^' in pos_tags)
        if has_content_word:
            results.append((tokens, pos_tags, tree.tokens[root]))

            # print(tokens, pos_tags, tree.tokens[root])

    return results
 def retrieve_phrases(self, toString=False):
     """Return the content phrase rooted at each dependency root.

     Each root's subtree indices are sorted into surface order and filtered
     by the NLP stop list and the `preserve_pos` tag whitelist. With
     toString=True the phrases are flattened into one string ('_' joins
     tokens inside a phrase, ' ' joins the phrases).
     """
     phrases = []

     for root in self.find_all_root():
         indices = self.find_descendants(root)
         indices += [root]
         indices = sorted(indices)

         # Drop stop-listed tokens first, then non-whitelisted POS tags.
         indices = [i for i in indices
                    if self.tokens[i].lower() not in nlp.getInstance().stop_list]
         indices = [i for i in indices if self.pos_tags[i] in preserve_pos]

         tokens = [self.tokens[i] for i in indices]
         tags = [self.pos_tags[i] for i in indices]  # kept for parity; unused

         if len(tokens) > 0:
             phrases.append(tokens)

     if not toString:
         return phrases
     else:
         joined = ["_".join(phrase) for phrase in phrases]
         return " ".join(joined)
 def _gen_forms(self, term):
     # Delegate to the NLP singleton: generate alternative forms of *term*
     # bounded by this object's threshold.
     # NOTE(review): assumes self.threshold was set by the constructor --
     # confirm against the enclosing class (not visible here).
     return nlp.getInstance().gen_forms(term, self.threshold)
 def _gen_forms(self, term):
     """Generate alternative forms of *term* via the NLP singleton,
     bounded by this object's threshold."""
     engine = nlp.getInstance()
     return engine.gen_forms(term, self.threshold)