Example #1
def test_chunking_de():
    language='german'
    tt = TreeTagger(language=language, path_to_home=treetagger_path)
    phrase = 'Das Haus hat einen großen hübschen Garten.'
    sentence = tt.tag(phrase)
    cp = TreeTaggerChunker(language=language, path_to_home=treetagger_path)
    print(cp.parse(sentence))
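The test above relies on the treetagger-python wrapper, whose tag() method returns one [word, POS, lemma] triple per token, which TreeTaggerChunker then groups into phrases. A minimal sketch of that intermediate output, assuming a working German install at treetagger_path:

tt = TreeTagger(language='german', path_to_home=treetagger_path)
for word, pos, lemma in tt.tag('Das Haus hat einen Garten.'):
    print(word, pos, lemma)
# e.g. 'Das' ART 'die', 'Haus' NN 'Haus', ...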
Example #2
    def __init__(self, products):
        self.username = ''
        self.prodlist = products
        #self.prodlist_itemoid = list(self.prodlist.keys())
        self.prodlist_itemoid = self.add_itemoid()
        self.request = Counter(self.prodlist.keys())
        self.request.subtract(
            self.request
        )  # subtract itself to get a dict with keys = product names and values = 0

        self.tagger = TreeTagger(language='italian')
        self.converter = Converter()

        self.positive_predicates = ['volere', 'aggiungere']
        self.negative_predicates = [
            'rimuovere', 'togliere', 'cancellare', 'eliminare'
        ]
        self.predicates = self.positive_predicates + self.negative_predicates
        self.completings = ['ok']
        self.terminatings = [
            'fine', 'tutto', 'termina', 'annulla', 'annullare'
        ]
        self.id_err = [
            'riconoscimento', 'identità', 'persona', 'utente', 'sono', 'faccia'
        ]
Example #3
def test_chunking_brackets():
    language='english'
    tt = TreeTagger(language=language, path_to_home=treetagger_path)
    phrase = 'What is the airspeed of an (unladen) swallow?'
    sentence = tt.tag(phrase)
    cp = TreeTaggerChunker(language=language, path_to_home=treetagger_path)
    print(cp.parse(sentence))
Example #4
    def __init__(self, path_projeto, noticia, legenda, titulo, path, path_dir):
        self.path_projeto = path_projeto
        self.diretorio = path_projeto + path_dir
        self.path_noticia = path_projeto + path
        self.noticia = noticia
        self.noticia_inicial = noticia
        self.legenda = legenda
        self.titulo = titulo
        self.lst_top_substantivos = []
        self.lst_top_entidades_nomeadas = []
        self.total_entidades = 0
        self.tipo_wordNet = ""
        self.lst_EntidadesNomeadas = []
        self.lst_interseccao_entidades = []
        self.lst_top_substantivos_physical = []
        self.lst_top_substantivos_objects = []
        self.lst_diferenca_entidades = []
        self.dict_lematizado = {}
        self.chunker = MWEChunker()

        self.crefdoc = CoreferenceDocument()
        self.snt_tok = []

        PATH_LOCATION = os.path.dirname(os.path.abspath(__file__))
        print(PATH_LOCATION)

        TREE_TAGGER_PATH = PATH_LOCATION + '/TreeTagger'
        print('exporting tree tagger at ', TREE_TAGGER_PATH)
        os.environ["TREETAGGER_HOME"] = TREE_TAGGER_PATH

        self.tree_tagger = TreeTagger(language='english')
Example #5
 def get_treetagger_postagger(self, language='en'):
     if not self.__has_key(self.__treetaggers, language):
         dirname, filename = os.path.split(os.path.abspath(__file__))
         self.__treetaggers[language] = TreeTagger(
             language=language,
             path_to_home=os.path.join(dirname, 'treetagger', 'cmd'))
     return self.__treetaggers[language]
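A hedged usage sketch of the memoizing getter above; repeated calls with the same language code should return the same cached instance rather than spawning a new tagger process (`factory` is a hypothetical owner of this method):

tagger = factory.get_treetagger_postagger('en')           # first call builds the tagger
assert tagger is factory.get_treetagger_postagger('en')   # second call hits the cache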
Example #6
def extract_keywords(raw_text, id, language):

    print("Extracting keywords for " + id)

    stemmer = nltk.PorterStemmer()

    # Construct text: tokenize and POS-tag according to the language
    if language == 'english':
        tokens = nltk.word_tokenize(raw_text)
        # filter undesirable words and format
        words = [w.replace('\'', '') for w in tokens if len(w) >= 3]
        text = nltk.Text(words)
        tagged_text = nltk.pos_tag(text)
    else:
        tt = TreeTagger(encoding='utf-8', language='french')
        tagged_text = tt.tag(raw_text.replace('\'', ' ').replace(u'\u2019', ' ')
                             .replace(u'\xab', ' ').replace(u'\xbb', ' '))

    print(tagged_text)

    # TODO: detect language using stop words, adapt filtering/stemming technique accordingly

    # Collect candidate multi-terms, with a maximum length of 4 tokens
    multiterms = []
    for i in range(len(tagged_text)):
        for l in range(1, 5):
            if i + l < len(tagged_text):
                tags = [tagged_text[k] for k in range(i, i + l)]
                if potential_multi_term(tags, language):
                    if language == 'english':
                        multistem = [str.lower(stemmer.stem(tagged_text[k][0]).encode('utf8', 'ignore'))
                                     for k in range(i, i + l)]
                    else:
                        # French and other languages are already lemmatized by TreeTagger
                        multistem = []
                        for k in range(i, i + l):
                            if tagged_text[k][2] != "<unknown>":
                                stem = tagged_text[k][2]
                            else:
                                stem = tagged_text[k][0]
                            multistem.append(str.lower(stem.encode('utf8', 'ignore')))
                    multiterms.append(multistem)

    return multiterms
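potential_multi_term is referenced but not shown in this example; a hypothetical sketch of such a filter, assuming it keeps tag windows that end in a noun (the real implementation in the source may differ):

def potential_multi_term(tags, language):
    # hypothetical rule: a candidate multi-term must end in a noun
    # (Penn tags start with 'NN' for English; French TreeTagger uses 'NOM')
    last_tag = tags[-1][1]
    if language == 'english':
        return last_tag.startswith('NN')
    return last_tag == 'NOM'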
Example #7
def tagLang(langs):
    global tagDictMap, taggerMap
    if "fr" in langs:
        frTagDict = convertTags('fr-treetagger.map')
        frTagger = TreeTagger(encoding='utf-8', language='french')
        tagDictMap['fr'] = frTagDict
        taggerMap['fr'] = frTagger
    if "sl" in langs:
        slTagDict = convertTags('sl-treetagger.map')
        slTagger = TreeTagger(encoding='utf-8', language='slovenian')
        tagDictMap['sl'] = slTagDict
        taggerMap['sl'] = slTagger
    if "de" in langs:
        deTagDict = convertTags('de-tiger.map')
        deTagger = TreeTagger(encoding='utf-8', language='german')
        tagDictMap['de'] = deTagDict
        taggerMap['de'] = deTagger
    if "it" in langs:
        itTagDict = convertTags('it-treetagger.map')
        itTagger = TreeTagger(encoding='utf-8', language='italian')
        tagDictMap['it'] = itTagDict
        taggerMap['it'] = itTagger
    if "pl" in langs:
        plTagDict = convertTags('pl-treetagger.map')
        plTagger = TreeTagger(encoding='utf-8', language='polish')
        tagDictMap['pl'] = plTagDict
        taggerMap['pl'] = plTagger
    if "sk" in langs:
        skTagDict = convertTags('sk-treetagger.map')
        skTagger = TreeTagger(encoding='utf-8', language='slovak')
        tagDictMap['sk'] = skTagDict
        taggerMap['sk'] = skTagger
    if "es" in langs:
        esTagDict = convertTags('es-treetagger.map')
        esTagger = TreeTagger(encoding='utf-8', language='spanish')
        tagDictMap['es'] = esTagDict
        taggerMap['es'] = esTagger
    if "nl" in langs:
        nlTagDict = convertTags('nl-treetagger.map')
        nlTagger = TreeTagger(encoding='utf-8', language='dutch')
        tagDictMap['nl'] = nlTagDict
        taggerMap['nl'] = nlTagger
    engTagger = TreeTagger(encoding='utf-8', language='english')
    taggerMap['en'] = engTagger
    engTagDict = convertTags('en-ptb.map')
    tagDictMap['en'] = engTagDict
    output = open('tagged.all', 'w')
    errors = open('errors.txt', 'w')
    numLines = 0
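Once tagLang has populated the global maps, a tagger can be looked up by language code and applied directly; a minimal usage sketch, assuming the .map files are present alongside the script:

tagLang(['fr'])
triples = taggerMap['fr'].tag(u'Le chat dort.')  # [word, POS, lemma] triples
print(triples)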
Example #8
def pos_tag_sents(tokenized_sents):
    from treetagger import TreeTagger

    tagged_sents = []
    for i, s in enumerate(tokenized_sents):
        tt = TreeTagger(language='german')
        tags = [t for t in tt.tag(s) if len(t) > 1]
        tags = [tuple(tag + ["_", ""]) for tag in tags]
        tagged_sents.append(tags)
    return tagged_sents
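pos_tag_sents rebuilds a TreeTagger for every sentence, which restarts the external tagger process each time; a hedged variant that hoists construction out of the loop (same wrapper API as above):

def pos_tag_sents_hoisted(tokenized_sents):
    from treetagger import TreeTagger

    tt = TreeTagger(language='german')  # built once, reused for all sentences
    tagged_sents = []
    for s in tokenized_sents:
        tags = [t for t in tt.tag(s) if len(t) > 1]
        tags = [tuple(tag + ["_", ""]) for tag in tags]
        tagged_sents.append(tags)
    return tagged_sents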
Example #9
def getFeatureVecFromPOS(oneLine, lang, n_gram_range, ngram_vec):
    clean_train_reviews = review_to_words(oneLine, lang)
    tt = TreeTagger(encoding='latin-1', language=lang)
    train_reviews_pos_tags = []

    train_reviews_pos_tags = tt.tag(clean_train_reviews)
    a = [col[1] for col in train_reviews_pos_tags]
    pos_line = " ".join(a)
    X = ngram_vec.transform(pos_line).toarray()
    return X
Example #10
def login():

    text = request.form["text"]

    verb = "empty"
    verb_added = False
    noun = "empty"
    noun_added = False
    arguments = []
    command = "empty"
    encoded_tags = []
    encoded_text_with_tags = []
    plural = False
    reload(sys)
    sys.setdefaultencoding('utf8')
    tt_pl = TreeTagger(
        path_to_treetagger='/Users/kasper/Development/Python/SmartTerminal/',
        language='polish')
    tags = tt_pl.tag(text)
    for tag in tags:

        if set(tag[1].split(":")) & set(data.VERB):
            verb = tag[2]
            verb_added = True
        elif verb_added and not set(tag[1].split(":")) & set(data.NOT_NOUN):
            if "pl" in tag[1]:
                plural = True
            noun = tag[2]
            noun_added = True
            verb_added = False
        elif noun_added and not set(tag[1].split(":")) & set(data.NOT_NOUN):
            arguments.append(tag[0])

    if plural:
        for argument in arguments:
            command = data.actions[verb][noun] + argument
            os.system(command)

    else:
        if not len(arguments) == 0:
            command = data.actions[verb][noun] + arguments[0]
        else:
            command = data.actions[verb][noun]
        os.system(command)

    if text:
        return jsonify({'result': 'OK'})
    else:
        return jsonify({'result': 'BAD'})
Example #11
def lemmatize_data(data):
    tt = TreeTagger(language='english')
    lemm_data = {"abstract": []}
    count = 0
    for index, row in data.iterrows():
        print(count)
        count += 1
        abstract = ''
        if len(row['abstract']) != 0:
            for word, _, lemma in tt.tag(row['abstract']):
                if lemma == '<unknown>':
                    abstract += word + ' '
                elif "|" in lemma:
                    parts = lemma.split("|")
                    abstract += min(parts, key=len) + ' '
                else:
                    abstract += lemma + ' '
        lemm_data['abstract'].append(abstract)
    return pd.DataFrame(lemm_data)
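The "|" branch above handles TreeTagger's ambiguous lemma fields (two candidates joined by a pipe) by keeping the shortest candidate; the rule in isolation:

lemma = 'be|bee'            # shape of an ambiguous TreeTagger lemma field
parts = lemma.split('|')
print(min(parts, key=len))  # -> 'be'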
Example #12
    def getOtherError(self):

        tt = TreeTagger(language='english')
        #getting the Stanford parse list
        testSentence = " ".join(self.words)
        StanParserLst = []
        StanParserLst = self.synt
        outLst = tt.tag(testSentence)
        tagLst = []

        #aligning the treetagger POS tags to match the stanford parser tags and adding changed tags to tagLst
        tagChangeDic = {
            'NP': 'NNP',
            'NPS': 'NNPS',
            'PP': 'PRP',
            'SENT': '.',
            '(': '-LRB-',
            ')': '-RRB-'
        }

        for word, tag, form in outLst:
            if tag in tagChangeDic.keys():
                tagLst.append(tagChangeDic.get(tag))
            else:
                tagLst.append(tag)

        #checking at each index if there is a mismatch in tagging between stanford parser and treetagger tags
        if len(self.synt) == len(tagLst):
            SPlst = []
            TTlst = []
            i = 0
            while (i < len(self.synt)):

                if self.synt[i] != tagLst[i]:

                    objOthererr = errDef.ErrorDef("OTHER")
                    retVal = objOthererr.checkOther(i, self.synt[i], self)
                    print(retVal)
                    if retVal != 0:
                        self.sentDetailobj.objLstProblems.AddToProblemListTypewise(
                            "OTHER", retVal)
                i += 1
Example #13
    def getPOS(self, text):

        tt = TreeTagger(language='polish')
        sentences = tt.tag(text)

        pos_sentences = []
        sent = []
        for w in sentences:
            if len(w) < 3:
                continue
            if w[1].find(':') == -1:
                tag = w[1]
            else:
                tag = w[1].split(':')[0]
            if tag == 'SENT':
                pos_sentences.append(sent)
                sent = []
            else:
                sent += [tag]
        self.pos_sentences = pos_sentences

        return self.pos_sentences
Example #14
def tokenize(text):
	'''
		Tokenize and lemmatize a text using tree tagger

		Arguments
			- text: string containing the text
		
		Returns
			- tokens: list of all the lemmatized tokens

		NOTE : tree tagger tag function returns
		a tuple (token, tag, lemmatized_token)
	'''
	tree = TreeTagger()

	text = text.lower()

	tokens = [t[2] for t in tree.tag(text) if len(t) == 3 and t[2] != '<unknown>' and t[1] != 'NUM' and t[1] != 'PUN']
	tokens = [t for t in tokens if len(t) > 2]
	tokens = [t for t in tokens if t not in STOPWORDS]

	return tokens
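A hedged usage sketch (the tagger language and the STOPWORDS set are whatever the surrounding module configures, so the exact lemmas will vary):

tokens = tokenize('The cats were sleeping on the sofas.')
print(tokens)  # lemmatized content words, e.g. ['cat', 'sleep', 'sofa']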
Example #15
def getFeatureVecFromPOS(fileName, lang, n_gram_range):
    train = pd.read_csv(fileName, header=0, delimiter="\t", quoting=1)
    num_text = train["text"].size

    clean_train_reviews = []
    # 	print "Looping through all text.\n"

    for i in xrange(0, num_text):
        clean_train_reviews.append(review_to_words(train["text"][i], lang))

    tt = TreeTagger(encoding='latin-1', language=lang)
    train_reviews_pos_tags = []

    for line in clean_train_reviews:
        a = tt.tag(line)
        a = [col[1] for col in a]
        pos_line = " ".join(a)
        train_reviews_pos_tags.append(pos_line)

    ngram_vectorizer = CountVectorizer(ngram_range=n_gram_range, min_df=1)
    X = ngram_vectorizer.fit_transform(train_reviews_pos_tags).toarray()
    return X, ngram_vectorizer
Example #16
def postags(data, text_column="reponse", lang="french"):
    """
    Return TreeTagger Part-Of-Speech outputs for large corpus. 
    
    Parameters
    ----------
    texts : list of str
        corpus
    lang : str, optional
        {french, spanish, english, ...}, by default "french"
    nb_split : int, optional
        number of text send to TreeTagger at each loop, by default 50
    """
    tree_tagger = TreeTagger(language=lang)
    res = tree_tagger.tag("\n##############END\n".join(
        data[text_column].values))
    res = [r for r in res if len(r) == 3]
    res = np.asarray(res)
    indexes = np.where(res[:, 0] == "##############END")[0]
    pos_tag_data = np.asarray(
        [parse_output(i[1:]) for i in np.split(res, indexes)])
    data["pos_tag"] = pos_tag_data
    return data
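The texts are joined with a sentinel token so TreeTagger is invoked only once, and the output stream is split back apart on that sentinel. A hedged usage sketch (hypothetical two-row DataFrame; parse_output comes from the same source file):

import pandas as pd

df = pd.DataFrame({'reponse': ['Le chat dort.', 'Les oiseaux chantent.']})
df = postags(df, text_column='reponse', lang='french')
print(df['pos_tag'])  # one parsed tag sequence per row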
Example #17
def lemmatize_data(data):
    tt = TreeTagger(language='english')
    res = []
    count = 0
    for index, row in data.iterrows():
        count += 1
        eprint(count)
        abstract = []
        for elem in tt.tag(row[' Abstract ']):
            lemma = elem[2]
            if lemma == '<unknown>':
                abstract.append(elem[0])
            elif len(lemma.split("|")) == 2:
                parts = lemma.split("|")
                if len(parts[0]) < len(parts[1]):
                    abstract.append(parts[0])
                else:
                    abstract.append(parts[1])
            else:
                abstract.append(lemma)
        if len(abstract) > 0:
            res.append(' '.join(word for word in abstract))
    return res
Example #18
 def test_language(language, phrase):
     dirname, filename = os.path.split(os.path.abspath(__file__))
     tt = TreeTagger(language=language,
                     path_to_home=os.path.join(dirname, 'treetagger', 'cmd'))
     return tt.tag(phrase)
Example #19
# python - treetagger interface
from treetagger import TreeTagger

tt = TreeTagger(language='french')


def clean_tag(tag):
    if len(tag) != 3:
        return None
    word, pos, lemma = tag
    if lemma == '<unknown>':
        return {'word': word, 'lemma': word, 'pos': 'UNKNOWN'}
    elif pos == 'NOM':
        return {'word': word, 'lemma': lemma, 'pos': 'NOUN'}
    elif pos.startswith('VER'):
        return {'word': word, 'lemma': lemma, 'pos': 'VERB'}
    elif pos.startswith('ADJ'):
        return {'word': word, 'lemma': lemma, 'pos': 'ADJ'}
    return None


def tag_text(text):
    text = text.replace("'", " ")
    raw_tags = tt.tag(text)

    clean_tags = []
    for tag in raw_tags:
        tag = clean_tag(tag)
        if tag is not None:
            clean_tags.append(tag)
    return clean_tags
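A hedged usage sketch of tag_text on a French sentence (the exact lemmas depend on the installed French parameter file):

for t in tag_text('Les chats noirs dorment'):
    print(t['word'], t['lemma'], t['pos'])
# e.g. chats/chat/NOUN, noirs/noir/ADJ, dorment/dormir/VERB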
Example #20
n_top_words = 30

################################################################################

if len(sys.argv) == 2:
    n_topics = int(sys.argv[1])
elif len(sys.argv) > 2:
    n_topics = [int(sys.argv[i]) for i in range(1, len(sys.argv))]

cv = CountVectorizer(stop_words=set(list(stopwords) + utils.manual_stopwords),
                     analyzer="word",
                     min_df=min_df_cv,
                     max_df=max_df_cv,
                     max_features=n_features)
analyzer = cv.build_analyzer()
tt = TreeTagger(language='english',
                path_to_treetagger="tree-tagger-linux-3.2.2/")

if not use_preprocessed_data:
    print("reading data")
    df = pd.read_excel(file_path, usecols=[13, 24], names=['id', 'abstract'])

    print("preprocessing data")
    df = utils.preprocess_data(df, analyzer, tt)
    df.to_csv("data/tesi_US_preprocessed.csv", index=None)
else:
    print("loading preprocessed data")
    df = pd.read_csv("data/tesi_US_preprocessed.csv")

print("training vectorizer")
TDmat = cv.fit_transform(df['preprocessed'])
joblib.dump(cv, "models/cv_{}.pkl".format(n_features))
Example #21
def computeSemanticSimilarityFeatures(sentence1, sentence2):
    features = [0] * 9

    if (sentence1 + sentence2) not in semanticsimilarity_lookuptable:
        def prepareSentence(sentence):
            return sentence.replace('-', ' ').replace('$', ' ')

        tt = TreeTagger(language='english')
        tags1 = [a for a in tt.tag(prepareSentence(sentence1)) if len(a) > 1]
        tags2 = [a for a in tt.tag(prepareSentence(sentence2)) if len(a) > 1]

        semanticsimilarity_lookuptable[sentence1 + sentence2] = [tags1, tags2]

    tags1 = copy.deepcopy(semanticsimilarity_lookuptable[sentence1 + sentence2][0])
    tags2 = copy.deepcopy(semanticsimilarity_lookuptable[sentence1 + sentence2][1])

    # Feature: noun/web semantic similarity
    # Get Synonym set
    def synSet(tags):
        for word in tags:
            # Only compare nouns or verbs; the inline conditionals guard
            # against POS tags that are too short to index
            if (word[1][0] != 'N' if len(word[1]) >= 1 else 1) and (word[1][:2] != 'VV' if len(word[1]) >= 2 else 1):
                continue

            word.append(wordnet.synsets(word[2]))

    synSet(tags=tags1)
    synSet(tags=tags2)

    simsMaxNoun = []
    simsAvgNoun = []
    simsMaxVerb = []
    simsAvgVerb = []

    for word1, word2 in product(tags1, tags2):
        type1 = word1[1]
        type2 = word2[1]

        if (type1[0] != 'N' and type1[:2] != 'VV') or type1 != type2:
            continue

        similarityMax = 0
        similarityAvg = 0
        if word1[2] == word2[2]:
            similarityAvg = 1
            similarityMax = 1
        else:
            for sense1, sense2 in product(word1[3], word2[3]):
                sim = wordnet.wup_similarity(sense1, sense2)
                similarityMax = max(similarityMax, sim)
                similarityAvg += sim if sim is not None else 0

        if type1[0] == 'N':
            simsMaxNoun.append(similarityMax)
            simsAvgNoun.append(similarityAvg / (len(word1[3]) + len(word2[3])) if len(word1[3]) + len(word2[3]) > 0 else 0)
        else:
            simsMaxVerb.append(similarityMax)
            simsAvgVerb.append(similarityAvg / (len(word1[3]) + len(word2[3])) if len(word1[3]) + len(word2[3]) > 0 else 0)


    features[0] = np.sum(simsMaxNoun) / len(simsMaxNoun) if len(simsMaxNoun) > 0 else 0
    features[1] = np.sum(simsAvgNoun) / len(simsAvgNoun) if len(simsAvgNoun) > 0 else 0

    features[2] = np.sum(simsMaxVerb) / len(simsMaxVerb) if len(simsMaxVerb) > 0 else 0
    features[3] = np.sum(simsAvgVerb) / len(simsAvgVerb) if len(simsAvgVerb) > 0 else 0

    # Feature: Cardinal number similarity
    def findCardinals(tags):
        cardinals = []
        for index, word1 in enumerate(tags):
            if word1[1] == 'CD':
                # is "more", "over" or "above" before?
                before = [a[0] for a in tags[max(index-2, 0):index]]

                try:
                    val = float(word1[0])
                except ValueError:
                    val = t2i.text2int(word1[0])

                maxValue = minValue = val

                if ("more" in before) or ("over" in before) or ("above" in before) or ("greater" in before):
                    maxValue = sys.maxint
                    minValue += 1
                elif ("less" in before) or ("under" in before) or ("below" in before) or ("smaller" in before):
                    minValue = -sys.maxint - 1
                    maxValue -= 1

                cardinals.append([minValue, maxValue])
        return cardinals

    cardinals1 = findCardinals(tags=tags1)
    cardinals2 = findCardinals(tags=tags2)

    def countCDMatches(cardinals1, cardinals2):
        count = 0
        for cd1 in cardinals1:
            for cd2 in cardinals2:
                if cd1[0] == cd2[0] and cd1[1] == cd2[1]:
                    count += 1
                    break
        return count

    features[4] = (countCDMatches(cardinals1, cardinals2) + countCDMatches(cardinals2, cardinals1)) / (len(cardinals1) + len(cardinals2)) if len(cardinals1) + len(cardinals2) > 0 else 1
    #features[2] = countCDMatches(cardinals1, cardinals2) / len(cardinals1) if len(cardinals1) > 0 else 1
    #features[3] = countCDMatches(cardinals2, cardinals1) / len(cardinals2) if len(cardinals2) > 0 else 1


    # Feature: Proper Name
    def findProperNouns(tags):
        nouns = []
        for word in tags:
            if word[1] == 'NPS':
                nouns.append(word[0])
        return nouns

    def countNounMatches(nouns1, nouns2):
        count = 0
        for noun1 in nouns1:
            for noun2 in nouns2:
                if noun1 == noun2:
                    count += 1
                    break
        return count

    nouns1 = findProperNouns(tags1)
    nouns2 = findProperNouns(tags2)

    features[5] = (countNounMatches(nouns1, nouns2) + countNounMatches(nouns2, nouns1)) / (len(nouns1) + len(nouns2)) if len(nouns1) + len(nouns2) > 0 else 1
    # features[4] = countNounMatches(nouns1, nouns2) / len(nouns1) if len(nouns1) > 0 else 1
    # features[5] = countNounMatches(nouns2, nouns1) / len(nouns2) if len(nouns2) > 0 else 1

    # Feature: Word2Vec (all)
    meaning1 = np.zeros(model.vectors.shape[1])
    for word in tags1:
        if word[2] in model:
            meaning1 += model[word[2]]

    meaning2 = np.zeros(model.vectors.shape[1])
    for word in tags2:
        if word[2] in model:
            meaning2 += model[word[2]]

    diffMeaning = meaning1 - meaning2
    features[6] = np.linalg.norm(diffMeaning)
    features[7] = scipy.spatial.distance.cosine(meaning1, meaning2)

    similarityMatrix = [0] * len(tags1)
    for index1, word1 in enumerate(tags1):
        row = [0]*len(tags2)
        for index2, word2 in enumerate(tags2):
            similarityMax = 0
            if len(word1) > 3 and len(word2) > 3:
                for sense1, sense2 in product(word1[3], word2[3]):
                    sim = wordnet.wup_similarity(sense1, sense2)
                    similarityMax = max(similarityMax, sim)
                similarityMax = 1 - similarityMax
            else:
                similarityMax = 1

            row[index2] = similarityMax
        similarityMatrix[index1] = row
    m = Munkres()
    totalCost = 0
    indices = m.compute(similarityMatrix)
    for row, column in indices:
        totalCost += similarityMatrix[row][column]

    features[8] = totalCost / len(indices)

    return features
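The last feature aligns the two tag sequences with the Munkres (Hungarian) algorithm, picking the one-to-one assignment with minimum total dissimilarity. A tiny self-contained illustration of the library call used above:

from munkres import Munkres

cost = [[0.2, 0.9],
        [0.8, 0.1]]
indices = Munkres().compute(cost)
print(indices)  # [(0, 0), (1, 1)] -> total cost 0.3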
Example #22
def get_data():
    debTime = time.time()
    df = pd.read_csv(data_path + 'csvTweets.csv')
    cols_to_keep = ['sentiment', 'longitude', 'latitude', 'text']
    df_clean = df[cols_to_keep].dropna()
    lista = []
    j = 0
    liststop = []
    listval = []
    while j < len(df.text):
        tt = TreeTagger(language='english')
        lis = tt.tag(str(df.text[j]).lower())
        for i in lis:
            if ('NN' in i[1]) or ('NP' in i[1]) or ('JJ' in i[1]) or (
                    'VB' in i[1]) or ('#' in i[0]) and (len(i[0]) > 3):
                if i[0] not in stop:
                    if "#" in i[0]:
                        i[0] = i[0].replace("#", "")
                    if "." in i[0]:
                        i[0] = i[0].replace(".", "")
                    lis = []
                    if "-" in i[0]:
                        lis = i[0].split("-")
                        if "" in lis:
                            lis.remove("")
                        i[0] = ' '.join(lis)
                    if "_" in i[0]:
                        lis = i[0].split("_")
                        if "" in lis:
                            lis.remove("")
                        i[0] = ' '.join(lis)
                    inc = 1
                    for letter in i[0]:
                        if not letter in "abcdefghijklmnopqrstuvwxyz ":
                            inc = 0
                    if (inc != 0) and (len(i[0]) > 3):
                        if i[0] in liststop:
                            listval[liststop.index(
                                i[0])] = listval[liststop.index(i[0])] + 1
                        else:
                            listval.append(1)
                            liststop.append(i[0])
        j = j + 1
    js = []
    j = 0
    lenval = len(listval)
    pod = 0
    kickval = ""
    kickstop = ""
    while j < lenval:
        pod = listval.index(max(listval))
        kickval = listval.pop(pod)
        kickstop = liststop.pop(pod)
        js = [kickstop, kickval]
        listm.append(js)
        j = j + 1
    duree = time.time() - debTime
    print "\n"
    print "   @--- Overhead /data socket : "+\
    str(duree)+" seconds ---@"
    print "\n"
    return df_clean.to_json(orient='records')
Example #23
    def envoiTweets(self):
        print "@----Envoi des tweets--@"
        cursor = conn.cursor()
        lastId = "0"
        query = "SELECT * FROM tweetAnalyser WHERE temps > (now() - interval 20 second - interval 2 hour) and tweetId > " + lastId
        query50 = "SELECT DISTINCT mot, COUNT(tweetId) AS frequency FROM linkKeywordTweet GROUP BY mot ORDER BY frequency DESC LIMIT 10"
        while not thread_stop_event.isSet():
            cursor.execute(query)
            dataMysql = cursor.fetchall()
            cursor.execute(query50)
            dataMysql50 = cursor.fetchall()
            listmotkey = []
            for elt in dataMysql50:
                coooo = (elt[0], elt[1])
                listmotkey.append(coooo)
            liste = []
            for elt in dataMysql:
                texte = elt[1][2:][:len(elt[1]) - 4]
                new = (texte, elt[2], elt[3], json.dumps(elt[4]))  #,elt[5])
                socketio.emit('reponse', {
                    'motsCles': listmotkey,
                    'tweets': [new]
                })
                liste.append(new)
                idt = elt[0]
                if lastId <= str(idt):
                    lastId = str(idt)
                s = elt[1].split()
                #				print idt
                tt = TreeTagger(language='english')
                lis = []
                lis = tt.tag(s)
                nostop = []
                #				print lis
                for k in lis:
                    if k[0] not in stop:
                        if ('NN' in k[1]) or ('NP' in k[1]) or (
                                'JJ' in k[1]) or ('VB' in k[1]) or (
                                    '#' in k[0]) and (len(k[0]) > 3):
                            #						print k[1]
                            #						print k [0]
                            motcle = k[0]
                            if "#" in k[0]:
                                motcle = motcle.replace("#", "")
                            if "." in k[0]:
                                motcle = motcle.replace(".", "")
                            motcle = motcle.lower()
                            lis = []
                            if "-" in motcle:
                                lis = motcle.split("-")
                                if "" in lis:
                                    lis.remove("")
                                motcle = ' '.join(lis)
                            if "_" in motcle:
                                lis = motcle.split("_")
                                if "" in lis:
                                    lis.remove("")
                                motcle = ' '.join(lis)
                            inc = 1
                            for letter in motcle:
                                if not letter in "abcdefghijklmnopqrstuvwxyz ":
                                    inc = 0
                            if (inc != 0) and (len(motcle) > 3):
                                nostop.append(motcle)

                            #print motcle

                for mot in nostop:
                    query1 = """INSERT INTO keyword (mot) VALUES (%s)"""
                    query2 = """INSERT INTO linkKeywordTweet (tweetId,mot) VALUES (%s,%s)"""
                    query3 = "select * from keyword where mot = (%s)"
                    query4 = "select * from linkKeywordTweet  where mot = (%s) and tweetId = (%s)"

                    cur = conn.cursor()
                    cur.execute(query3, (str(mot)))
                    dataMysql3 = cur.fetchall()

                    s = str(mot)
                    if len(s) < 31:

                        if dataMysql3 == ():
                            cur.execute(query1, (str(mot)))
                        #else:
                        #	print ' that shit existe so no insert into keyword'
                        cur.execute(query4, (str(mot), str(idt)))
                        dataMysql4 = cur.fetchall()
                        if dataMysql4 == ():
                            #	print str(mot)
                            #	print str(idt)
                            #print s + '------ add it '
                            cur.execute(query2, (str(idt), str(mot)))
                        #else:
                        #	print ' that shit existe so no insert into linkKeywordTweet'
                    #else :
                    #print s + '------ couldnt add it because of reasons '
                    conn.commit()
                nostop = []
            conn.commit()
Example #24
#! /usr/bin/python

from treetagger import TreeTagger
from collections import defaultdict

import csv
import re
import json
import sys
import random
import math

k = 8
saco_de_gato = {}
tt_pt = TreeTagger(language = 'portuguese2')
palavra_morfologia = {}
pattern = re.compile("(^PUNCT.*$|^AUX.*$|^PRON.*$|^DET.*$|^ADP.*$|^SCONJ.*$)")

def sort_second(value):
    return value[1]

# Read the csv
# ID;PERGUNTAS;RESPOSTAS;CLASSES;;
with open(sys.argv[1], 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter = ';')
    for row in csv_reader:
        pergunta = {}
        pergunta['id'] = row[0].strip()
        pergunta['pergunta'] = row[1].strip()
        pergunta['resposta'] = row[2].strip()
        pergunta['classe'] = row[3].strip()
Example #25
def tags(raw):
    tagger = TreeTagger(language='portuguese')
    return tagger.tag(raw.lower())
Example #26
#encoding: utf-8
from treetagger import TreeTagger
tt = TreeTagger(language='english', encoding='latin-1')
tagged_sent = tt.tag(
    'What is the airspeed of an unladen swallow? And what about the € sign?')
print tagged_sent
Example #27
def normalize(sentence_to_normalize):
    print_coloured_bold('Sentence to stem:',"green")
    print(sentence_to_normalize + '\n')

    #removing m-dash
    sentence_to_normalize = sentence_to_normalize.replace("–"," ").lower()
    sentence_to_normalize = re.sub("-{2,}","",sentence_to_normalize)

    #removing contract forms
    if("'t" in sentence_to_normalize):
        sentence_to_normalize = sentence_to_normalize.replace("'t","")

    #removing specifications inside parenthesis
    start = sentence_to_normalize.find( '(' )
    end = sentence_to_normalize.find( ')' )
    if start != -1 and end != -1:
      sentence_to_normalize = sentence_to_normalize.replace(sentence_to_normalize[start:end+1],"")

    #tokenization
    word_tokens = word_tokenize(sentence_to_normalize)

    #punctuation removal
    word_tokens_filtered = [w for w in word_tokens if not w in punctuation and not w=="'s"]

    #skip if punctuation within words (except -./) or split if / within word
    word_tokens_noslash = list()
    for w in word_tokens_filtered:
        if any(char in punctuation.replace("-","").replace(".","").replace("/","") for char in w):
            return False
        if "/" in w:
            words = w.split("/")
            for split in words:
                if not split == "":
                    word_tokens_noslash.append(split)
        else:
            word_tokens_noslash.append(w)

    #leave acronyms and split others in case of .
    word_tokens_dot = list()
    regex = re.compile('(?:[a-z]\.){2,}')
    for w in word_tokens_noslash:
        if(w+"." in sentence_to_normalize and regex.match(w+".")):
            word_tokens_dot.append(w)
        elif("." in w):
            words = w.split(".")
            for split in words:
                if not split == "":
                    word_tokens_dot.append(split)
        else:
            word_tokens_dot.append(w)

    #stopwords removal (done before stemming, less words to stem)
    stop_words = set(stopwords.words('english'))
    no_stopwords_sentence = [w for w in word_tokens_dot if not w in stop_words]

    #digits removal
    sentence_words_nodigits = [w for w in no_stopwords_sentence if not w.isdigit()]

    #roman numerals removal
    regex = re.compile('^(?=[MDCLXVI])M*D?C{0,4}L?X{0,4}V?I{0,4}$')
    no_roman_numerals_sentence = [w for w in sentence_words_nodigits if not regex.match(w)]

    #one letter words removal
    sentence_words_nosingleletters = [w for w in no_roman_numerals_sentence if not len(w)<2]
    #print('No one letter words:')

    #stemming
    stemmed_sentence = ""
    stemmer = TreeTagger(path_to_treetagger='/home/biar/Desktop/ProgettoWIR/treetagger')
    for word in sentence_words_nosingleletters:
        stem = stemmer.tag(word)
        if not(stem[0][1] == "CRD"):
            if not stem[0][2] == '<unknown>':
                if '|' in stem[0][2]:
                    first_word = ((stem[0][2]).split('|'))[0]
                    stem[0][2] = first_word
                    if(len(first_word)>1):
                        stemmed_sentence += (correct_stemming(stem).lower() + " ")
                else:
                    if(len((stem[0][2]).lower())>1):
                        stemmed_sentence += (correct_stemming(stem).lower() + " ")
            else:
                stemmed_sentence += ((stem[0][0]).lower() + " ")

    print_coloured_bold('Stemmed sentence:',"yellow")
    print(stemmed_sentence.strip())
    print('\n')
    return stemmed_sentence.strip()
Example #28
# Read the rules from the file
rules = []
for line in open("dict/rules.tsv", "r"):
    line = line.replace("\n", "")
    line = line.replace("\r", "")
    rules.append(line)

# Build the parser, and parse rules
parser = yacc.yacc()
print("\nLoading rules:")
for rule in rules:
    result = parser.parse(rule)
    print("\tRule parsed succesfully: %s" % rule)

tt = TreeTagger(encoding='latin-1', language=language)
sentiwordnet = SentiWordnet()
to_wordnet = TreetaggerToWordnet()

# Init Tornado web server
application = tornado.web.Application([
    (r"/", MainHandler,
     dict(rules=compiled_rules,
          to_wordnet=to_wordnet,
          sentiwordnet=sentiwordnet,
          tt=tt,
          stopwords=stopwords,
          chunks=chunks,
          language=language)),
])
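For completeness, a hedged sketch of how a Tornado application like this is usually started; the port is an assumption, not taken from the source, and tornado.ioloop must be imported:

if __name__ == "__main__":
    application.listen(8888)  # hypothetical port
    tornado.ioloop.IOLoop.current().start()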
Example #29
 def test_language(language, phrase):
     tt = TreeTagger(language=language, path_to_home=treetagger_path)
     return tt.tag(phrase)
Example #30
def tagLang(langs, corpus):
    if "fr" in langs:
        frTagDict = convertTags('fr-treetagger.map')
        frTagger = TreeTagger(encoding='utf-8', language='french')
    if "sl" in langs:
        slTagDict = convertTags('sl-treetagger.map')
        slTagger = TreeTagger(encoding='utf-8', language='slovenian')
    if "de" in langs:
        deTagDict = convertTags('de-tiger.map')
        deTagger = TreeTagger(encoding='utf-8', language='german')
    if "it" in langs:
        itTagDict = convertTags('it-treetagger.map')
        itTagger = TreeTagger(encoding='utf-8', language='italian')
    if "pl" in langs:
        plTagDict = convertTags('pl-treetagger.map')
        plTagger = TreeTagger(encoding='utf-8', language='polish')
    if "sk" in langs:
        skTagDict = convertTags('sk-treetagger.map')
        skTagger = TreeTagger(encoding='utf-8', language='slovak')
    if "es" in langs:
        esTagDict = convertTags('es-treetagger.map')
        esTagger = TreeTagger(encoding='utf-8', language='spanish')
    if "nl" in langs:
        nlTagDict = convertTags('nl-treetagger.map')
        nlTagger = TreeTagger(encoding='utf-8', language='dutch')
    engTagger = TreeTagger(encoding='utf-8', language='english')
    engTagDict = convertTags('en-ptb.map')
    corpus = open(corpus, 'r')
    output = open('tagged.all', 'w')
    errors = open('errors.txt', 'w')
    numLines = 0

    for line in corpus:
        #numLines += 1
        #print numLines
        if line.startswith("<fr>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, frTagDict, frTagger, "fr", output, errors)
        elif line.startswith("<sl>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, slTagDict, slTagger, "sl", output, errors)
        elif line.startswith("<de>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, deTagDict, deTagger, "de", output, errors)
        elif line.startswith("<it>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, itTagDict, itTagger, "it", output, errors)
        elif line.startswith("<pl>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, plTagDict, plTagger, "pl", output, errors)
        elif line.startswith("<sk>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, skTagDict, skTagger, "sk", output, errors)
        elif line.startswith("<es>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, esTagDict, esTagger, "es", output, errors)
        elif line.startswith("<nl>"):
            splitline = line.split(" ")
            notag = " ".join(splitline[1:])
            tagWrite(notag, nlTagDict, nlTagger, "nl", output, errors)
        elif line.startswith("-1") or line[0].isdigit():
            splitline = line.split(",")
            output.write("\t".join(splitline))
        elif line.startswith("!@#$%^&*()"):
            output.write(line)
        else:
            tagWrite(line, engTagDict, engTagger, "en", output, errors)