Example #1
import codecs
import sys

from polyglot.text import Text


def main():
    Eng_side = 1
    source_data = codecs.open(sys.argv[1], 'r', encoding='utf-8').readlines()
    target_data = codecs.open(sys.argv[2], 'r', encoding='utf-8').readlines()
    #
    align = []
    for index, sent in enumerate(target_data):
        target_data[index], tmp = sent.split(" ||| ")
        #
        tmp = tmp.split()
        tmp = [int(a.split("-")[1]) for a in tmp]
        align.append(tmp)
    print ("S:", source_data[:5])
    print ("T:", target_data[:5])
    print (align[:5])
    if Eng_side == 0:
        for sent in source_data:
            text = Text(sent)
            print (text.pos_tags)
            sys.exit()
    else:
        for sent in target_data:
            text = Text(sent)
            Noun_index = []
            index = 0
            for word, POS in text.pos_tags:
                if POS == "NOUN":
                    Noun_index.append(index)
                index += 1
            print (Noun_index)

            sys.exit()
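
For reference, a minimal sketch of the `sentence ||| alignments` line format this script expects in the target file (the sample line below is invented):

line = "das Haus ist blau ||| 0-0 1-1 2-2 3-3"
sentence, alignments = line.split(" ||| ")
target_indices = [int(pair.split("-")[1]) for pair in alignments.split()]
print(sentence)         # das Haus ist blau
print(target_indices)   # [0, 1, 2, 3]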
Example #2
from collections import defaultdict

from polyglot.text import Text


# `kddcorpus` and `sectpull` are provided elsewhere in the source project.
def polyglot_entities(fileids=None, section=None, corpus=kddcorpus):
    """
    Extract entities from each file using polyglot
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()

    for fileid in fileids:
        if section is not None:
            text = Text((list(sectpull([fileid], section=section))[0][1]))
        else:
            text = Text(corpus.raw(fileid))

        for entity in text.entities:
            etext = " ".join(entity)

            if entity.tag == 'I-PER':
                key = 'persons'
            elif entity.tag == 'I-ORG':
                key = 'organizations'
            elif entity.tag == 'I-LOC':
                key = 'locations'
            else:
                key = 'other'

            results[fileid][key].append(etext)

    return results
Example #3
    def extractEntities(self, text=None, filePath=None):
        if text and filePath:
            self.entity_logger.error(
                "Fatal error: text (string) and file-input cannot be input at the same time. Undefined behavior"
            )
            sys.exit(0)
        if text:
            # str.replace returns a new string, so the result must be reassigned
            self.text = text.replace("-", " - ")
            self.entities = Text(self.text, hint_language_code='no').entities
            self.entity_logger.info(
                "Ekstraksjon av entiteter gjennomfort fra tekst-string"
            )
        if filePath:
            if os.path.isfile(filePath):
                with open(filePath) as f:
                    self.text = f.read()
                self.text = self.text.replace("-", " - ")
                self.entities = Text(self.text,
                                     hint_language_code='no').entities
                self.entity_logger.info(
                    "Entities extracted from {}".format(filePath))
                self.entity_logger.info(
                    "Following entities: {} were extracted from file: {}".
                    format(self.entities, filePath))
        self.formatEntities()
Example #4
from polyglot.text import Text


def get_named_entities(text, lang="tr"):
    ptext = Text(text, hint_language_code=lang)  # output can be re-organised
    sentences = ptext.sentences
    entities = []
    for sentence in sentences:
        lang_sentence = Text(str(sentence), hint_language_code=lang)
        s_entities = [(" ".join(entity), entity.tag) for entity in lang_sentence.entities]
        entities.extend(s_entities)
    
    return entities
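
A minimal usage sketch, assuming the Turkish polyglot models (embeddings2.tr and ner2.tr) have already been downloaded; the sample sentence is invented and the exact entity spans depend on the model:

sample = "Mustafa Kemal Atatürk 1881 yılında Selanik'te doğdu."
for entity_text, tag in get_named_entities(sample, lang="tr"):
    print(tag, entity_text)   # e.g. I-PER Mustafa Kemal Atatürk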
Example #5
def simplify(text, tips):
    if text is None or len(text.strip()) == 0:
        return
    pgText = Text(text)
    pgText.hint_language_code = 'fr'
    pgText.pos_tags  # trigger POS tagging using the French language hint

    for word, pos in pgText.pos_tags:
        startpos = text.index(word)
        if pos == u'PRON':
            new_word = replace_propn(word)
            if new_word:
                print(word)
                index = len(warnings)
                warnings.append(Warning(index, startpos, startpos+len(word),
                                        "Utiliser mot simple", new_word))
                # tips.append(Tip(C_COMPLEX_WORD, startpos, startpos + len(word), word))
        elif pos == u'VERB':
            if word.startswith("l'") or word.startswith("d'"):
                verb = word.split("'")[1]
            else:
                verb = word
            new_word = replace_verb(verb)
            if new_word:
                index = len(warnings)
                warnings.append(
                    Warning(
                        index,
                        startpos,
                        startpos + len(word),
                        "Utiliser:",
                        lemma[verb] + ' -> ' + new_word
                        )
                    )
    for sentence in Text(text).sentences:
        sentstr = str(sentence)
        if sentstr.count(',') >= 1:
            idofcomma = sentstr.index(',')
            startidx = sentence.start-1+idofcomma
            index = len(warnings)
#           TODO: change message to French
            warnings.append(
                Warning(
                    index,
                    startidx,
                    startidx + 5,
                    "Avoid clauses",
                    "Delete or use seperate sentences"
                    )
                )
Example #6
def entity_strings(article):
    try:
        curr_entities = Text(article).entities
    except:
        #This is necessary because polyglot has problems with some UTF-8 characters
        printable = ''.join(x for x in article if x.isprintable())
        curr_entities = Text(printable).entities

    entity_strings = []
    for entity in curr_entities:
        entity_str = extract_chunk_string(entity,article)
        entity_strings.append(entity_str)

    return entity_strings
Example #7
from langdetect import detect
from polyglot.text import Text


def test_detection(test1, test2):
    """
    This is for testing purposes only.
    """
    ld = detect(test1) 
    print("langdetect", ld) 
    ld = detect(test2) 
    print("langdetect", ld) 
    text = Text(test1)
    pg = text.language.code  
    print("polyglot", pg) 
    text = Text(test2)
    pg = text.language.code  
    print("polyglot", pg) 
Example #8
    def tokenize(self, caption, lang, b_filter_url=True, b_filter_special=True, b_remove_stop=True, b_unique=True):
        caption = re.sub(r'[\s]', ' ', caption.lower(), flags=re.UNICODE)
        if lang == 'en':
            caption = re.sub(r'[^\w\s@#]', '', caption, flags=re.UNICODE)
            tokens = filter(lambda x: len(x) > 2, caption.strip().split(' '))
            if b_filter_special:
                tokens = filter(lambda x: self.is_special(x) is not True, tokens)
            if b_filter_url:
                tokens = filter(lambda x: self.is_url(x) is not True, tokens)
            if b_remove_stop:
                tokens = filter(lambda x: x not in self.stop[lang], tokens)
            if b_unique:
                return list(set(tokens))
            return tokens
        elif lang == 'ar' or lang == 'ru':
            try:
                if b_filter_special:
                    caption = ' '.join(filter(lambda x: self.is_special(x) is not True, caption.split(' ')))
                tokens = filter(lambda x: len(x) > 1, Text(caption).words)
                if b_filter_url:
                    tokens = filter(lambda x: self.is_url(x) is not True, tokens)
                if b_remove_stop:
                    tokens = filter(lambda x: x not in self.stop[lang], tokens)
                if b_unique:
                    return list(set(tokens))
                return tokens
            except Exception:
                traceback.print_exc()
                return []
        else:
            return []
Example #9
def identify_language(text):
    """
    Identifies language from string using polyglot package.

    :param text: String to use for language identification
    :return: Language name (English)
    """

    try:
        if text is not None and text != '':
            text = Text(text)
            language_code = text.language.code
            if language_code is not None:
                language_name = country.languages.get(
                    alpha_2=language_code).name
            else:
                language_name = None
        else:
            language_name = None
            language_code = None
    except AttributeError as ae:
        language_name = None
        language_code = None

    return language_name
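
A usage sketch, assuming the module imports pycountry under the name `country` (which is what the `country.languages.get(alpha_2=...)` call above suggests):

import pycountry as country   # assumption: matches the `country` name used above

print(identify_language("Guten Morgen, wie geht es Ihnen heute?"))   # should print something like "German"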
Example #10
def get_quote_vector(entry, fast_text_models, enriched_collection):

    article = enriched_collection.find_one({"url": entry["source"]})

    fast_text = fast_text_models[entry["language"]]

    cleaned_quote = get_cleaned_content(entry["quote"])
    try:
        parsed = Text(cleaned_quote)

        tokens = [str(token).lower() for token in parsed.tokens]
    except pycld2.error as err:
        print(err)
        tokens = cleaned_quote.split()
    quote_vector = vectorize_tokens(tokens, fast_text)

    quote_vectors = []

    for talker in entry["talker"]:

        entity_key = talker["entity"].lower().replace(".", "DOT")
        entity_tokens = [
            token.lower()
            for token in article["cleaned_content_entities_parsed"][entity_key]
        ]

        entity_vector = vectorize_tokens(entity_tokens, fast_text)
        semantic_distance = cosine(quote_vector, entity_vector)

        quote_vectors.append(semantic_distance)

    return np.array(quote_vectors)
Example #11
    def transform(self, X):
        print("loading spacy model before transforming")
        #Evaluating if spacy spanish model exists
        nlp = spacy.load(self.model)
        print("evaluating sentiment2 model before transforming")
        #Evaluating if polyglot sentiment module exist
        try:
            Text("Evaluando existencia de sentiment2.es",
                 hint_language_code="es").polarity
        except:
            #Destination where we will copy sentiment2 pickle
            #This code of exception is designed to work on google app engine with /code/ as python root
            #Comment it and install package manually for other platforms
            print("sentiment2 not found. Copying...")
            parent_dest = str(downloader.default_download_dir())

            createdPath1 = parent_dest + "/sentiment2/es"
            if not os.path.isdir(createdPath1):
                os.makedirs(createdPath1)

            copyfile(
                str(os.getcwd()) + "/processing/es.sent.pkl.tar.bz2",
                createdPath1 + "/es.sent.pkl.tar.bz2")
            print("copy finished")
        print("transforming")
        return numpy.concatenate(
            [singleSampleProcessPipeline(doc, nlp) for doc in X])
Example #12
from polyglot.detect import Detector
from polyglot.text import Text


def determine_text_languages(string):

    input_object = Text(string)

    temp_list = []

    for sentence in input_object.sentences:
        detect = Detector(str(sentence))
        if len(temp_list) == 0:
            temp_list.append([sentence, detect.language.code])
        else:
            if temp_list[-1][1] == detect.language.code:
                temp_list[-1][0] = temp_list[-1][0]+" "+sentence
            else:
                temp_list.append([sentence, detect.language.code])

    new_list= []

    for i in temp_list:
        new_list.append(i[0])

    output = []

    start = 0
    end = 0
    for sentence in new_list:
        detect = Detector(str(sentence))
        end = start + len(sentence)
        position = (start, end)
        start = end + 1
        output.append((detect.language.code, position, detect.language.confidence,))
    return output
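
A usage sketch on an invented mixed-language string; the function returns (language code, (start, end), confidence) tuples per merged run of sentences, and very short or ambiguous sentences can make the detector unreliable:

mixed = ("The weather in London has been unusually warm this week. "
         "Le gouvernement français a annoncé de nouvelles mesures hier soir.")
print(determine_text_languages(mixed))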
Example #13
  def preprocess(self, out_file, lc):
    """Tokenize, (lowercase,) sub-word split.
    
    Using Polyglot since it was used for JW300.
    Preprocess the source column of a dataframe object and write to file.
  
    Pipeline:
    - tokenize
    - split into sub-words

    Append pre-processed sources to dataframe."""
    tokenized_sentences = []
    bped_sentences = []
    sources = []
    with open(out_file, 'w') as ofile:
      for i, row in self._src_df.iterrows():
        sentence_i = Text(row[0]).sentences[0]
        tokenized_sentence = ""
        bped_sentence = ""
        tokenized = " ".join(sentence_i.words)
        sources.append(str(sentence_i))
        if lc:
          tokenized = tokenized.lower()
        tokenized_sentence = tokenized
        bped = self._bpe_model.process_line(tokenized)
        bped_sentence = bped
        ofile.write("{}\n".format(bped))
        tokenized_sentences.append(tokenized_sentence)
        bped_sentences.append(bped_sentence)
    data = self._src_df.assign(
        tokenized_sentences=tokenized_sentences)
    data = data.assign(
        bped_sentences=bped_sentences)
    return data, sources
Example #14
    def Extract(self, request, context):
        text = request.text
        if text is None or not len(text.strip()):
            return

        entity_count = 0
        for language in request.languages:
            if language not in LANGUAGES:
                continue
            try:
                parsed = Text(text, hint_language_code=language)
                for entity in parsed.entities:
                    label = ' '.join(entity)
                    label = CLEAN.sub(' ', label)
                    label = collapse_spaces(label)
                    if len(label) < 4 or len(label) > 200:
                        continue
                    if ' ' not in label:
                        continue
                    length = entity.end - entity.start
                    entity_count += 1
                    yield ExtractedEntity(label=label,
                                          offset=entity.start,
                                          length=length,
                                          type=TYPES[entity.tag])
            except Exception:
                log.exception("Cannot extract. Language: %s", language)
        log.info("Extract: extracted %s entities.", entity_count)
Example #15
    def polyglot_tags(self):

        with open(config.file_location, errors="ignore") as r:
            inputtext = r.read()
        tok = ""
        with open("punkttokenizer_fullcorpus.pickle",
                  'rb') as f:  # using our custom tokenizer
            custom_sent_tokenizer = pickle.load(f)
        tokenized = custom_sent_tokenizer.tokenize(inputtext)
        tok += str(tokenized)
        text = Text(tok)
        if config.type_of_tag == "pos":
            print("\nPos tags using Polyglot\n")
            print(text.pos_tags)

        elif config.type_of_tag == "ner":
            print("\nNER tags using Polyglot\n")
            print(text.entities)
        elif config.type_of_tag == "mor":
            print("\nMorphemes using using Polyglot\n")
            # morphemes for cases where corpus is not formatted properly i.e. space between words are removed. eg "corporatebank" to "corporate" ,"bank"
            print(text.morphemes)
        else:
            print(
                "\n Please provide correct value to type_of_tag. Refer to the instructions\n"
            )
Example #16
import numpy as np

from polyglot.text import Text


def get_score(row):
    text = Text(row.full_text)
    try:
        score = sum([word.polarity for word in text.words])
    except ValueError as e:
        score = np.NaN
    return score
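
A usage sketch over a small pandas DataFrame, assuming the polyglot sentiment lexicons for the detected languages (e.g. sentiment2.en) are installed; the sample texts are invented:

import pandas as pd

df = pd.DataFrame({"full_text": ["I really love this product, it works great.",
                                 "This was a terrible and disappointing experience."]})
df["score"] = df.apply(get_score, axis=1)
print(df)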
Example #17
def polyglot_sentiment():
    from polyglot.text import Text

    data = dict(default_data)
    data['message'] = "Sentiment Analysis API - POST only"
    data['sentiment'] = {}

    params = request.form  # postdata

    if not params:
        data['error'] = 'Missing parameters'
        return jsonify(data)

    if not params.get('text'):
        data['error'] = 'Text parameter not found'
        return jsonify(data)

    if 'lang' not in params:
        language = 'en'  # default language
    else:
        language = params['lang']

    polyglot_text = Text(params['text'], hint_language_code=language)
    data['sentiment'] = polyglot_text.polarity
    return jsonify(data)
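
A client-side usage sketch, assuming this view is registered on a locally running Flask app under a /sentiment route (the route name and port are assumptions):

import requests

resp = requests.post("http://localhost:5000/sentiment",
                     data={"text": "I really enjoyed this movie", "lang": "en"})
print(resp.json())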
Example #18
    def find_entites(self, input_text):
        lang = self.lang_detect(input_text)

        def get_tagged_tuple(word, entities):
            for e in entities:
                if word in list(e):
                    return (' '.join(e), e.tag)
            return (word, None)

        def remove_duplicates(l):
            seen = set()
            result = []
            for item in l:
                if item not in seen:
                    seen.add(item)
                    result.append(item)
            return result

        if lang in self.__langs:
            text = Text(input_text)
            sents_res = []
            for sent in text.sentences:
                word_tags = []
                for wrd in sent.words:
                    tagged_tuple = get_tagged_tuple(wrd, text.entities)
                    word_tags.append(tagged_tuple)
                sents_res.append(remove_duplicates(word_tags))
            res = {'lang': lang, 'ner_result': sents_res}
            return res
        else:
            raise LanguageNotAvailableError('Language is not available')
Example #19
from polyglot.text import Text


def use_polyglot(text):
    """
    Input: string. Output: predicted language code.
    """
    text = Text(text)
    plang = text.language.code  
    return plang 
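
A usage sketch; polyglot's language detector ships with the package, so no model download is needed, though very short snippets can be detected unreliably:

print(use_polyglot("Dies ist ein ausreichend langer deutscher Beispielsatz."))   # expected: "de"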
Example #20
async def get_words_and_phrases(text, text_language, user_language, user):
    sentences = list()
    processor = TextPreprocessor(CODES[text_language])
    # TODO save to cache the processors
    t = Text(text)
    subscribed = mysql_connect.check_subscribed(user)
    if not subscribed:
        await bot.send_message(user, "/subscribe to get translations for your text sentences")

    for s in t.sentences:
        sent = BotSentence(s.start, s.end)
        # TODO paid feature
        translation = await bot_utils.get_definitions(text_language, user_language, s.string, user)
        sent.translation = translation
        key_words = processor.key_words(s.string)
        for kw in key_words:
            w = kw[0]
            if ' ' in w:
                sent.words.append(w)
        for word in s.words:
            word = str(word)
            if re.match(r"[^\w]+", word) is not None:
                continue
            sent.words.append(word)
        sentences.append(sent)

    return sentences
Example #21
    def getNERFromText(self, path, topicNumber, numberOfPages):

        allFiles = glob.glob(os.path.join(path, '*.txt'))
        allTopicPersonsFoundByNER = []
        topicNumber = topicNumber - numberOfPages

        while (topicNumber < len(allFiles)):
            file = open(allFiles[topicNumber], "r")
            textOfTopic = file.readlines()[0]
            text = Text(textOfTopic, hint_language_code='hr')
            person = []
            if (text):
                #print (text.sentences)
                #allKeyWords[:] += list(text.sentences)
                for entity in text.entities:
                    #print (entity)
                    if (entity.tag == "I-PER"):
                        #print (entity)
                        if (len(entity) >= 1):
                            name = ""
                            for en in entity:
                                #print (en)
                                name = name + en + " "

                            name = name.strip(" ")
                            if (name not in person):
                                person.append(name)
            #print (person)
            allTopicPersonsFoundByNER.append(person)
            topicNumber += 1
        return allTopicPersonsFoundByNER
Example #22
def part_of_speech(text, list_n, list_adj, list_v):

    ###############
    #
    # Extracts three parts of speech (nouns, adjectives, verbs) separately
    # from the other parts of speech.
    #
    # param >> text
    # param >> list_n: list that collects nouns
    # param >> list_adj: list that collects adjectives
    # param >> list_v: list that collects verbs
    #
    # return >> None
    #
    ###############

    # auxiliary/light verb forms that should not be collected
    skip_verbs = {"do", "doing", "did", "does", "be", "is", "are", "were", "was", "can"}

    for token in Text(text).pos_tags:
        #print(token)
        # keep only the parts of speech of interest
        if u'NOUN' in token:
            list_n.append(token[0])
        elif u'ADJ' in token:
            list_adj.append(token[0])
        elif u'VERB' in token or u'AUX' in token:
            if token[0].lower() not in skip_verbs:
                list_v.append(token[0])
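
A usage sketch, assuming the English polyglot models (embeddings2.en and pos2.en) are installed; the sample sentence is invented:

nouns, adjectives, verbs = [], [], []
part_of_speech("The quick brown fox jumps over the lazy dog.", nouns, adjectives, verbs)
print(nouns)        # e.g. ['fox', 'dog']
print(adjectives)   # e.g. ['quick', 'brown', 'lazy']
print(verbs)        # e.g. ['jumps']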
Example #23
def analyze(doc):
    if doc not in entities:
        entities[doc] = [
            "_".join(entity)
            for entity in Text(doc, hint_language_code="en").entities
        ]
    return entities[doc]
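
A usage sketch, assuming a module-level `entities = {}` cache dict (as implied by the function) and that the English NER models (embeddings2.en and ner2.en) are installed:

entities = {}   # the cache analyze() reads and writes

print(analyze("Angela Merkel met Emmanuel Macron in Paris."))
# e.g. ['Angela_Merkel', 'Emmanuel_Macron', 'Paris']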
Example #24
import os
from collections import Counter

import numpy as np
from polyglot.text import Text


def nGramCount(text, n):
    """
        Computes the count of the POS tag n-grams of the
        text supplied. The language hint is hard-coded to Danish ('da').

        Args:
            text (string): The text to be analysed for
                its POS tag n-gram count
            n (int): Denotes the grams to extract

        Returns:
            dictionary: A dictionary with the n-gram as the key in the form
                of a tuple, and the count as the value
    """

    if 'postags' not in os.listdir('.'):
        posTagging = Text(text, hint_language_code='da')
        # store tags as plain str so the cache file round-trips cleanly under Python 3
        posTagging = np.array([x[-1] for x in posTagging.pos_tags])
        np.savetxt('postags', posTagging, delimiter=',', fmt='%s')
    else:
        posTagging = np.loadtxt('postags', delimiter=',', dtype=str)

    new_posTagging = (tuple(posTagging[i:i + n][:])
                      for i in range(len(posTagging) - n + 1))
    return Counter(new_posTagging)
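
A usage sketch, assuming the Danish polyglot models (embeddings2.da and pos2.da) are installed; note the function caches tags in a local file named postags in the current directory:

counts = nGramCount("Jeg bor i København og arbejder i en lille butik hver dag.", 2)
print(counts.most_common(3))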
Example #25
    def analyze(self, document):
        if document.type in [document.TYPE_TABULAR, document.TYPE_OTHER]:
            return
        collector = DocumentTagCollector(document, self.ORIGIN)
        text = document.text
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        try:
            hint_language_code = None
            if len(document.languages) == 1:
                hint_language_code = document.languages[0]
            text = Text(text, hint_language_code=hint_language_code)
            for entity in text.entities:
                if entity.tag == 'I-LOC' or len(entity) == 1:
                    continue

                label = ' '.join(entity)
                if len(label) < 4 or len(label) > 200:
                    continue
                collector.emit(label, self.TYPES[entity.tag])

        except ValueError as ve:
            log.info('NER value error: %r', ve)
        except Exception as ex:
            log.warning('NER failed: %r', ex)
        finally:
            log.info('Polyglot extracted %s entities.', len(collector))
            collector.save()
Example #26
def word_count(text, score, language_code='la'):
    # `score` (e.g. +1 or -1) was a module-level value in the original project;
    # it is taken as an explicit parameter here. `lemmatize` is defined elsewhere.
    text = Text(lemmatize(str(text)), hint_language_code=language_code)
    words = 0
    for w in text.words:
        if w.polarity == score:
            words += 1
    return words
Example #27
def do_sentiment_analysis():
    req_data = request.get_json()

    sentence = req_data['sentence']
    time = req_data['time']
    type_of_person = req_data['type_of_person']
    group_chat_name = req_data['group_chat_name']

    text = Text(sentence)

    sentiment = text.polarity

    cursor.execute(
        '''
                INSERT INTO SentiAna.dbo.Sentiment_Analysis(Sentence, Sentiment, TypeOfPerson, GroupName)
                VALUES
                (?,?,?,?)
                ''', (sentence, sentiment, type_of_person, group_chat_name))
    conn.commit()

    return '''
            status_code:{},
            status:{},
            message:{}
            '''.format('200', 'successful', 'Analysis was successful')
Example #28
def transliterate_to_arabic(text):
    # takes in a string
    words = re.findall(r"[\w']+|[.,،!?;]", text)
    # words = text.split(" ")
    # transliterate each word alone
    tr_words = []
    for word in words:
        if is_arabic(word) or has_numbers(word) or word.isupper():
            tr_words.append(word)
            continue
        if word == "" or word == "." or word == "," or word == "،":
            tr_words.append(word)
            continue
        word_poly = Text(word)
        try:
            tr_word = word_poly.transliterate("ar")
        except:
            word = word.encode("ascii", errors="ignore").decode()
            tr_words.append(word)
            continue
        for w in tr_word:
            if not w == "":
                tr_words.append(w)
                break
    # combine
    tr_text = ""
    for w in tr_words:
        tr_text += " "
        tr_text += w

    return tr_text[1:]  # just to remove whitespace
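
A usage sketch, assuming the Arabic transliteration model (transliteration2.ar) is installed and that the is_arabic() and has_numbers() helpers used above are defined elsewhere in the source project; the input string is invented:

print(transliterate_to_arabic("marhaba, kaifa halak?"))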
Example #29
def query():
    persons=[]
    organizations=[]
    locations=[]

    article = request.args.get('article')
    exclude = set(string.punctuation)

    #remove_punctuation
    article=''.join(ch for ch in article if ch not in exclude)

    #remove_english_letters
    article=re.sub(r'[a-zA-Z?]', '',article).strip()

    text = Text(article)

    #language detection
    language=text.language.name

    #NER
    ner=text.entities

    for entity in ner:
        if entity.tag == 'I-PER':
            edit_per = " ".join(entity)
            if edit_per not in persons:
                persons.append(edit_per)
        elif entity.tag == 'I-ORG':
            edit_org = " ".join(entity)
            if edit_org not in organizations:
                organizations.append(edit_org)
Example #30
    def __call__(self, text):
        """Performs language detection, tokenization, and named entity recognition.
        
        Args:
            text(str): text.
        
        Returns:
            Dictionary that contains:
            1. tokens - list of objects Token.
            2. entities - list of objects TaggedSpan.
            3. lang - str language of text.
        """

        pg_text = Text(text)
        token_spans = self._tokenize(pg_text)
        pg_entities = pg_text.entities
        ann_entities = []
        for pg_ent in pg_entities:
            tok_begin = token_spans[pg_ent.start]
            tok_end = token_spans[pg_ent.end - 1]
            ann_entities.append(
                ann.TaggedSpan(pg_ent.tag, tok_begin.begin, tok_end.end))

        return {
            'tokens': _make_tokens(token_spans, text),
            'entities': ann_entities,
            'lang': pg_text.language.code
        }