Example No. 1
def split_word(sentence):
    sentence = format_sentence(sentence)
    words = sentence.split(' ')
    word_array = []
    hash_table = get_hashtable()

    i = 0
    len_ = len(words)

    while i < len_:
        if i + 2 < len_ and is_exist((words[i] + ' ' + words[i + 1] + ' ' + words[i + 2]).encode('utf-8'), hash_table)\
                == u'\u2713':
            word_array.append(
                Word(words[i] + '_' + words[i + 1] + '_' + words[i + 2],
                     u'\u2713'))
            i += 2
        elif i + 1 < len_ and is_exist(
            (words[i] + ' ' + words[i + 1]).encode('utf-8'),
                hash_table) == u'\u2713':
            word_array.append(Word(words[i] + '_' + words[i + 1], u'\u2713'))
            i += 1
        else:
            if is_exist(words[i].encode('utf-8'), hash_table) == u'\u2713':
                word_array.append(Word(words[i], u'\u2713'))
            else:
                if words[i] != '\n':
                    word_array.append(Word('~' + words[i], u'\u274C'))
                else:
                    word_array.append(Word(words[i], u'\u2713'))
        i += 1

    return word_array
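The function above leans on project helpers (format_sentence, get_hashtable, is_exist) and a Word class that are not shown here. The sketch below swaps in hypothetical stand-ins purely to illustrate the greedy three-word / two-word / single-word matching; it is not the project's real wiring.

# Hypothetical stand-ins for the helpers split_word expects.
class Word:
    def __init__(self, text, flag):
        self.text, self.flag = text, flag
    def __repr__(self):
        return '{} {}'.format(self.text, self.flag)

def format_sentence(sentence):
    return sentence.strip()

def get_hashtable():
    # A plain set of known (byte-encoded) phrases stands in for the real hash table.
    return {b'hello', b'New York', b'New York City'}

def is_exist(key, hash_table):
    # Mirror the original contract: a check mark means the phrase is known.
    return u'\u2713' if key in hash_table else u'\u274C'

print(split_word('hello New York City'))
# Expected: [hello ✓, New_York_City ✓]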
Example No. 2
    def __init__(self, path, source='text'):
        self.words = []
        if source == 'umarker':  #xml from uncertaintymaker code
            import untangle
            template = untangle.parse(path)
            for word in template.transcription.words:
                if word.word.cdata not in ["[sil]", "[NOISE]", "[SPEECH]"]:
                    self.words.append(
                        Word(word.word.cdata, word.timeStart.cdata,
                             word.timeEnd.cdata))

        elif source == 'ldc-wrd':
            wrd_file = open(path, 'r')
            for line in wrd_file:
                elements = line.split(' ')
                word = elements[2].rstrip()
                time_from = self.convert_time(elements[0],
                                              from_time='ldc',
                                              to_time='ms')
                time_to = self.convert_time(elements[1],
                                            from_time='ldc',
                                            to_time='ms')
                self.words.append(Word(word, time_from, time_to))
            wrd_file.close()
        elif source == 'text':
            path = path.encode('ascii', 'ignore')
            for word in word_tokenize(path):
                if word not in ['"', "'", ".", "!", '``', '`', "''", '""']:
                    self.words.append(Word(word, '', ''))
Example No. 3
    def words(self, word, line):
        str = ''
        for w in word:
            if w in self.symbols and not self.reading_string:
                if str != '':
                    if str.lower() in self.table:
                        self.channel_values.append(Word(
                            str, str.lower(), line))
                        #print(Word(str, self.table.get(str.lower()), line).toString())
                    else:
                        self.channel_values.append(
                            Word(str, 'identifier', line))
                        #print(Word(str, Tag.IDENTIFIER, line).toString())
                    str = ''
                self.channel_values.append(Token(w, w, line))
                #print(Token(w, self.symbols.get(w), line).toString())
            else:
                str += w
                if w == "'" and self.reading_string == False:
                    self.reading_string = True
                elif w == "'" and self.reading_string:
                    self.big_string += str
                    self.channel_values.append(String(self.big_string, line))
                    #print(String(self.big_string, line).toString())
                    self.reading_string = False
                    str = ''
                    self.big_string = ''

        if self.reading_string:
            self.big_string += str + ' '
        else:
            if str != '':
                self.channel_values.append(Word(str, 'identifier', line))
Example No. 4
    def test_IsWordLegal_IllegalWordBadCollisionsOneLetterAnchorDown_ReturnsFalse(
            self):

        # Arrange
        board = Board()
        t = Tile('t', -1, -1.0, -1)
        e = Tile('e', -1, -1.0, -1)
        s = Tile('s', -1, -1.0, -1)
        i = Tile('i', -1, -1.0, -1)
        n = Tile('n', -1, -1.0, -1)
        g = Tile('g', -1, -1.0, -1)

        hand = Hand("test",
                    [t, e, s, t, i, n, g, t, e, s, t, i, n, g, s, i, t])
        testing = Word([t, e, s, t, i, n, g])
        sit = Word([s, i, t])

        # Act
        board.PlaceWord(testing, board.GetAnchors()[0], hand, 0, 'down')
        board.PlaceWord(sit, board.GetAnchors()[3], hand, 1, 'across')
        results = board.IsWordLegal(testing, board.GetAnchors()[6], 3, 'down')

        # Assert
        #board.PrintBoard()
        self.assertFalse(results[0])
        self.assertEqual(results[1],
                         'word creates an invalid word when placed')
Example No. 5
	def get(self,word,stress_ambiguity=True):
		# python3 unnecessary:
		#if type(word)==str:
		#	word=word.decode('utf-8',errors='ignore')

		(word,punct)=gleanPunc(word)

		if self.has(word):
			words=self.dict['Word'][word.lower()]
		elif self.getprep:
			words=self.getprep(word,config=self.config)
		else:
			return [Word(word,[],None)]

		if not words:
			return [Word(word,[],None)]

		if type(words)==list:
			if type(words[0])==tuple:	# New word needs to be built
				wordobjs=[]
				for wordtuple in words:
					wrd=wordtuple[:2]
					attrs=wordtuple[2] if len(wordtuple)>2 else {}
					wordobj=self.make(wrd,word)
					for _k,_v in list(attrs.items()): setattr(wordobj,_k,_v)
					wordobjs+=[wordobj]
				self.dict['Word'][word.lower()]=wordobjs
				return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
			else:
				wordobjs=words
		else:
			wordobjs=[words]

		return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
Example No. 6
 def parse_matrix(self, matrix):
     self.__check_format(matrix)
     for i in xrange(len(matrix)):
         words_coordinates = self.__extract_words_coordinates(matrix[i])
         for (x1, x2) in words_coordinates:
             self.horizontal_words.append(Word(i, x1, x2))
     for j in xrange(len(matrix[0])):
         vertical_list = [row[j] for row in matrix]
         words_coordinates = self.__extract_words_coordinates(vertical_list)
         for (y1, y2) in words_coordinates:
             self.vertical_words.append(Word(j, y1, y2))
     self.__build_tree(self.horizontal_words[0], False)
Example No. 7
def read_input():
    input_file = open("input.txt", "r")
    line1 = input_file.readline().rstrip("\n").split("OR")
    for i in line1:
        a.add_word(Word(i.strip()))

    num = int(input_file.readline())
    for i in range(num):
        line = input_file.readline().rstrip("\n").split("OR")
        new_sen = Sentence()
        for j in line:
            new_sen.add_word(Word(j.strip()))
        KB.append(new_sen)
Example No. 8
def toCSVEntry(entry):
    elements = entry.split('\t')
    rel = elements[1]
    source = elements[2]
    target = elements[3]

    sourceForm = ''
    targetForm = ''
    sourceCategory = ''
    targetCategory = ''

    if source.endswith(('/n', '/v', '/a', '/s', '/r')):
        sourceForm = source[:-2].replace('/c/fr/', '').replace('_', ' ').strip()
        sourceCategory = source[-1:]
    else:
        sourceForm = source.replace('/c/fr/', '').replace('_', ' ').strip()

    if target.endswith(('/n', '/v', '/a', '/s', '/r')):
        targetForm = target[:-2].replace('/c/fr/', '').replace('_', ' ').strip()
        targetCategory = target[-1:]
    else:
        targetForm = target.replace('/c/fr/', '').replace('_', ' ').strip()

    line = (sourceCategory + "\t" + source + "\t" + sourceForm + "\t" + rel +
            "\t" + targetCategory + "\t" + target + "\t" + targetForm + "\n")

    idSource = toId(sourceForm, sourceCategory)
    idTarget = toId(targetForm, targetCategory)

    sourceWord = None
    targetWord = None

    try:
        sourceWord = words[idSource]
    except KeyError:
        sourceWord = Word(sourceForm, sourceCategory)
        words[idSource] = sourceWord

    try:
        targetWord = words[idTarget]
    except KeyError:
        targetWord = Word(targetForm, targetCategory)
        words[idTarget] = targetWord

    sourceWord.add_relation(rel, targetWord)
    #sourceWord.print_relations_count()
    return line
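toCSVEntry expects tab-separated assertion lines whose third and fourth fields look like /c/fr/ concept URIs (ConceptNet-style French concepts), optionally ending in a part-of-speech suffix such as /n or /v. A hedged sketch with hypothetical stand-ins for the module-level words dict, toId, and Word:

# Hypothetical stand-ins so toCSVEntry can be run on one sample line.
words = {}

class Word:
    def __init__(self, form, category):
        self.form, self.category, self.relations = form, category, []
    def add_relation(self, rel, other):
        self.relations.append((rel, other))

def toId(form, category):
    return form + '/' + category

sample = 'edge-uri\t/r/IsA\t/c/fr/chat/n\t/c/fr/animal/n'
print(toCSVEntry(sample))
# tab-separated: n  /c/fr/chat/n  chat  /r/IsA  n  /c/fr/animal/n  animal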
Example No. 9
 def add(self, word, documentID, occurences):
     if word in self.cache:
         self.cache[word].add(documentID, occurences)
     else:
         wordInstance = Word(word, self.directory)
         wordInstance.add(documentID, occurences)
         self.cache[word] = wordInstance
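The add method above belongs to an index class that is not shown. Below is a hypothetical container built around the same logic so the caching behaviour can be run end to end; Word here is a bare posting-list holder, not the project's real class.

# Hypothetical in-memory versions of the pieces the method relies on.
class Word:
    def __init__(self, word, directory):
        self.word, self.directory, self.postings = word, directory, {}
    def add(self, documentID, occurences):
        self.postings[documentID] = occurences

class InvertedIndex:
    def __init__(self, directory):
        self.directory, self.cache = directory, {}
    def add(self, word, documentID, occurences):
        if word in self.cache:
            self.cache[word].add(documentID, occurences)
        else:
            wordInstance = Word(word, self.directory)
            wordInstance.add(documentID, occurences)
            self.cache[word] = wordInstance

index = InvertedIndex('index-dir')
index.add('cat', 1, 3)
index.add('cat', 2, 1)
print(index.cache['cat'].postings)   # {1: 3, 2: 1}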
Example No. 10
def main():
    content = fileReader()
    content = [Word(s, targetWord) for s in content]
    print(f"Created {len(content)} words")
    content = [word for word in content if word.isValid]
    print(f"Reduced to {len(content)} valid candidates.")
    findMatches(content)
Example No. 11
    def WriteSonnet(self):
        self.mySonnet = []
        for i in range(self.desiredLines):
            line = []
            followingWord = Word("@")

            syllables = 0

            if (self.rhymeLines[i] >= 0):
                followingWord = self.wordChain.GetRhymingWord(
                    self.mySonnet[self.rhymeLines[i]][-1], self.rhymeLevel)
                line.insert(0, followingWord.GetWord())
                syllables += followingWord.CountSyllables()

            while syllables < self.desiredLength:
                nextWord = self.wordChain.GetRandomLeader(
                    followingWord.GetWord())
                for k in range(1, 5):
                    if nextWord.GetWord() != "@":
                        break
                    nextWord = self.wordChain.GetRandomLeader(
                        followingWord.GetWord())
                if nextWord.GetWord() == "@":
                    break
                line.insert(0, nextWord.GetWord())
                followingWord = nextWord
                followingWord.GetWordStress()
                syllables += nextWord.CountSyllables()

                self.PrintProgress(i + 1, syllables)

            self.mySonnet.append(line)
Example No. 12
 def FindBestMove(self, hand, board):
     #Return word, anchor, anchorindex, direction
     #Get words for each anchor, find best word, compare best words sequentially
     anchors = board.GetAnchors()
     random.shuffle(anchors)
     bestWord = Word()
     bestWord.SetScore(min)  # the builtin 'min' is used as a "no score yet" sentinel
     for anchor in anchors:
         # get list of possible words for each anchor
         # match words to hand
         words = self.MatchWords(hand, anchor, board)
         # check for case no legal move is found
         if words is not None:
             # set scores for words, find best word
             for word in words.keys():
                 word.SetScore(self.heuristic.ScoreWord(word, hand))
                 if word.GetScore() > bestWord.GetScore():
                     bestWord = word
                     bestAnchor = anchor
                     bestIndex = words[word][0]
                     bestDirection = words[word][1]
     # check for case no legal move is found
     if bestWord.GetScore() is min:
         raise Exception("BRI: No valid word options found!")
     return bestWord, bestAnchor, bestIndex, bestDirection
Example No. 13
 def MatchWords(self, hand, anchor, board):
     # match available tiles in hand to possible words for a certain anchor
     anchorWords = anchor.GetPossibleWords()
     handTiles = hand.PeekHand()
     anchorTile = anchor.GetTile()
     if anchorTile.GetLetter() == " ":
         handTiles.append(anchorTile)
     tiles = handTiles
     totalHand = Word(tiles)
     options = anchorWords.WordSearch(totalHand)
     optionsCleaned = dict()
     direction = anchor.GetDirection()
     timeStart = time.time()
     shuffledOptions = list(options.GetDict().values())
     random.shuffle(shuffledOptions)
     #print(shuffledOptions)
     for strWordList in shuffledOptions:
         for strWord in strWordList:
             if len(strWord) <= len(handTiles):
                 word = self.MakeItWord(strWord)
                 if anchor.GetLetter() == " ":
                     indices = [int(len(strWord)/2)]
                 else:
                     indices = [i for i, a in enumerate(word.GetString()) if a == anchor.GetLetter()]
                 for i in indices:
                     if board.IsWordLegal(word, anchor, i, direction):
                         optionsCleaned[word] = (i, direction)
         timeDiff = time.time() - timeStart
         if (timeDiff > 5):
             break
     return optionsCleaned
Example No. 14
    def handle_response(self, response):
        soup = BeautifulSoup(response.decode('utf-8', 'ignore'))

        table = soup.find('table', {'border': '1'})
        is_going_on = False
        for tr in table.findAll('tr'):
            is_going_on = True
            for td in tr.findAll('td'):
                td_value = str(td)
                word = re.search(r'(.*)kelime=(.+)&amp;c(.*)',
                                 td_value).group(2)

                is_letter = len(word) == 1
                is_proper_name = word[0].isupper()
                is_phrase = ' ' in word

                w = Word(word, is_letter, is_proper_name, is_phrase)

                if DictionaryConfig.detailed_log:
                    print 'word consumed #', len(
                        DictionaryService.regular_words) + 1, ':', w.get_value()

                if w.is_regular():
                    DictionaryService.regular_words.append(w)
                elif w.is_proper_name:
                    DictionaryService.proper_words.append(w)
                elif w.is_phrase:
                    DictionaryService.phrases.append(w)

                DictionaryService.fs.write(w.formatted_value() + "\n")

        return is_going_on
Example No. 15
 def addWordToList(self, word, fWord):
     if self.wordInList(word):
         return False
     else:
         newWord = Word(word, fWord)
         self.wordList.append(newWord)
         return True
Example No. 16
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts into
    a Hash_Set.
    Each Word object in hash_set contains the number of comparisons
    required to insert that Word object from file_variable into hash_set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None
    -------------------------------------------------------
    """
    lines = fv.read()
    words = lines.split()

    for word in words:

        if word.isalpha():

            k = Word(word.lower())
            hash_set.insert(k)

    return
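A hedged sketch of the two collaborators this function assumes: a Word whose __eq__ counts comparisons, and a single-bucket Hash_Set. Both are simplified stand-ins for the course's real classes; the call reuses insert_words exactly as defined above.

# Simplified, hypothetical stand-ins for the Word and Hash_Set classes.
import io

class Word:
    def __init__(self, text):
        self.text, self.comparisons = text, 0
    def __eq__(self, other):
        self.comparisons += 1          # count every equality test
        return self.text == other.text
    def __hash__(self):
        return hash(self.text)

class Hash_Set:
    def __init__(self):
        self._slot = []                # a single bucket is enough for a sketch
    def insert(self, word):
        if word not in self._slot:     # membership test drives Word.__eq__
            self._slot.append(word)

hs = Hash_Set()
insert_words(io.StringIO("the cat sat on the mat"), hs)
print(len(hs._slot))   # 5 distinct words; the repeated "the" is not re-inserted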
Example No. 17
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts into
    a Hash_Set.
    Each Word object in hash_set contains the number of comparisons
    required to insert that Word object from file_variable into hash_set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None
    -------------------------------------------------------
    """
    
    fv.seek(0)


    for line in fv:
        for word in line.strip().split():
            
            if word.isalpha():
                l = Word(word.lower())
                hash_set.insert(l)
                
    fv.close()
    return
Example No. 18
 def test_frequency_query_of_empty_word(self):
     """
     Make sure code still runs if an empty string is passed into frequency
     """
     word = Word("", "")
     querier = DataMuseQuerier()
     self.assertTrue(len(str(querier.get_frequency(word))) > 0)
Example No. 19
 def test_frequency_query_of_fake_word(self):
     """
     Make sure a non-English word typed in still returns something and doesn't throw an error
     """
     fake_word = Word("asdfjk", "")
     querier = DataMuseQuerier()
     self.assertTrue(len(str(querier.get_frequency(fake_word))) > 0)
Example No. 20
def filter_corpus_for_relevance(filein, fileout):
    print "Converting ", filein, " to ", fileout
    print "\tReading ", filein
    inpt = open(filein, 'r')
    content = filter(lambda x: x != "",
                     inpt.readlines()[0].replace("\n", "").split(" "))
    inpt.close()

    print "\tNow filtering and writing relevant words"
    print "\t", len(content), "words to go..."
    outpt = open(fileout, 'w')
    cache = dict()
    for i in xrange(len(content)):
        if i % 1000000 == 0:
            print "\t\tIteration", i
        word = content[i]
        if word not in cache:
            word_object = Word(word)
            if word_object.relevant():
                cache[word] = word_object.lemma()
            else:
                cache[word] = False
        token = cache[word]
        if token is not False:
            outpt.write(token + " ")
    outpt.close()
    print "Done!"
Example No. 21
 def tokenize(self, sent, pos_tags):
     head = Word()
     head.word = [sent[0].lower()]
     head.actual = [sent[0]]
     head.pos = pos_tags[0]
     curr = head
     length = len(sent)
     for i in xrange(1, len(sent)):
         new_word = Word()
         new_word.left = curr
         curr.right = new_word
         new_word.word = [sent[i].lower()]
         new_word.actual = [sent[i]]
         new_word.pos = pos_tags[i]
         curr = new_word
     return head, length
Example No. 22
 def test_best_synonym_of_empty_word(self):
     """
     Make sure get_best_synonym returns an empty string when given an empty string
     """
     word_substitutor = WordSubstitutor()
     self.assertEqual(
         word_substitutor.get_best_synonym(Word("", "")).get_word(), "")
Example No. 23
	def make(self,stressedipasylls_text,token):
		stressedipa=stressedipasylls_text[0]
		sylls_text=stressedipasylls_text[1]

		stress=stressedipa2stress(stressedipa)
		(prom_stress,prom_strength)=getStrengthStress(stress)
		syllphons=self.ipa2phons(stressedipa)

		sylls=[]

		for i in range(len(syllphons)):
			syllbody=self.use('SyllableBody',syllphons[i])
			syll=self.use('Syllable',(syllbody,prom_strength[i],prom_stress[i]))
			#print token,i,syllbody,syll,syllphons,stressedipa,stress,prom_stress,prom_strength
			sylls.append(syll)

		word=Word(token,sylls,sylls_text)
		word.ipa=stressedipa
		word.stress=stress
		word.lang=self.lang

		# when is word broken?
		if not word.ipa:
			word.broken=True


		return word
Example No. 24
    def pseudo2real(self, pseudo_words, increment=False):
        """ Convert a pseudo sentence to a real sentence.
            If increment is True, we update the occurrence count.
        """
        #Word._connection.debug=True
        self.words = []
        for pword in pseudo_words:
            try:
                real_word = Word.byAppeared_name(
                    pword['appeared_name'].encode('utf-8'))
                #Do we increment?
                if increment: real_word.increment()
            except SQLObjectNotFound:
                #We don't have the word yet
                try:
                    main_type = MainType.byName(
                        pword['main_type'].encode('utf-8'))
                except SQLObjectNotFound:
                    main_type = MainType(name=pword['main_type'])
                try:
                    sub_type = SubType.byName(
                        pword['sub_type'].encode('utf-8'))
                except SQLObjectNotFound:
                    sub_type = SubType(name=pword['sub_type'])

                # We create a new word object
                real_word = Word(appeared_name=pword['appeared_name'],
                                 appeared_reading=pword['appeared_reading'],
                                 base_name=pword['base_name'],
                                 base_reading=pword['base_reading'],
                                 main_type=main_type.id,
                                 sub_type=sub_type.id)
            self.words.append(real_word)
Example No. 25
    def new_word(self):
        word1 = self.text1.get().lower().lstrip().rstrip()
        word2 = self.text2.get().lower().lstrip().rstrip()

        if len(word1) == 0 or len(word2) == 0:
            self.text2.insert(0.0, "Заполните все поля")  # "Fill in all fields"
            return 0
        elif (word1[0] in ABV_english and word2[0] in ABV_english) or (word1[0] in ABV_russian and word2[0] in ABV_russian):
            self.text3.delete(0.0, END)
            self.text3.insert(0.0, "Так ниизяяя, подумайте дважды")  # "That's not allowed, think twice"
        else:
            languages = ['russian', 'english'] if word1[0] in ABV_russian else ['english', 'russian']
            
            word = Word(word=languages[0], translate=languages[1])
            result = word.set(word1, word2)

            if result == -1:
                self.text3.delete(0.0, END)
                self.text3.insert(0.0, "Такой вид перевода уже имеется")  # "This translation already exists"
            else:
                file = open('new_words.txt', 'a')
                file.write(word1+'-'+word2+'\n')
                file.close()
                
                self.text3.delete(0.0, END)
                self.text3.insert(0.0, "Перевод записан!")  # "Translation saved!"
Example No. 26
def buildWords(word_vectors, features, f_types):
    words = []
    for word_vector in word_vectors:
        word = Word(word_vector, features, f_types)
        words.append(word)

    return words
Example No. 27
def comparison_total(hash_set):
    """
    -------------------------------------------------------
    Sums the comparison values of all Word objects in hash_set.
    -------------------------------------------------------
    Parameters:
        hash_set - a hash set of Word objects (Hash_Set)
    Returns:
        total - the total of all comparison fields in the Hash_Set
            Word objects (int)
        max_word - the word having the most comparisons (Word)
    -------------------------------------------------------
    """

    total = 0
    max_word = Word('a')

    for word in hash_set:

        total += word.comparisons

        if word.comparisons > max_word.comparisons:

            max_word = word

    return total, max_word
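A quick hypothetical check of comparison_total: a plain list stands in for the Hash_Set, since the function only iterates it, and Word is reduced to a text plus a comparisons counter, which also satisfies the internal Word('a') call.

# Hypothetical Word carrying only the attribute comparison_total reads.
class Word:
    def __init__(self, text, comparisons=0):
        self.text, self.comparisons = text, comparisons

hash_set = [Word('cat', 2), Word('dog', 5), Word('mat', 1)]
total, max_word = comparison_total(hash_set)
print(total, max_word.text)   # 8 dog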
Example No. 28
 def __init__(self):
     print("Word_data_Based_on : . https://github.com/KKuTu-Korea/KKuTu")
     print("-----------------------------------------------------------")
     self.word = Word()
     self.words = {}  # word dict
     self.thirdword = ['', '', '']
     self.usedword = []
Example No. 29
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts into
    a Hash_Set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        Each Word object in hash_set contains the number of comparisons
        required to insert that Word object from file_variable into hash_set.
    -------------------------------------------------------
    """
    fv.seek(0)
    lines = fv.readlines()

    for line in lines:
        #print("[{}]".format(line.rstrip()))
        words = line.split(' ')
        for word in words:
            if word.isalpha():
                #print("Word: {}".format(word))
                # Ignoring any punctuation and words with punctuation
                _word = Word(word.lower())
                hash_set.insert(_word)
    return
Example No. 30
    def test_AnchorSearch_FindingWordsWithScript_ReturnsCorrectWords(self):

        # Arrange
        words = Words()
        selectedKeys = []
        s = Tile('S', 1, 0.09480520300163461, 3)
        c = Tile('C', 3, 0.04049934678472928, 29)
        r = Tile('R', 1, 0.07098146383333229, 11)
        i = Tile('I', 1, 0.0885545324304026, 5)
        p = Tile('P', 3, 0.029410465329100584, 41)
        t = Tile('T', 1, 0.06566549066880407, 17)
        word = Word([s, c, r, i, p, t])
        allWordsContainWord = True
        noWordsContainWord = True

        # Act
        selectedWords = words.AnchorSearch(word)
        for key in selectedWords.GetDict():
            selectedKeys.append(key)
            for element in selectedWords.GetDict().get(key):
                if word.GetString() not in element:
                    allWordsContainWord = False
        for key in words.GetDict():
            if key not in selectedKeys:
                for element in words.GetDict().get(key):
                    if word.GetString() in element:
                        noWordsContainWord = False
                        print(element, "contains the word", word.GetString())

        # Assert
        self.assertTrue(allWordsContainWord)
        self.assertTrue(noWordsContainWord)