예제 #1
0
파일: MIRE.py 프로젝트: decode/Rpkg
def test():
  """Print bigram statistics for the files named on the command line.

  Every command-line argument is normalised with os.path.normcase and
  expanded as a glob pattern.  Each non-blank line of every matching file
  is stripped, lower-cased and passed through getWordList, getBigrams and
  getTokens to accumulate bigram and token counts.

  Output on stdout: a two-line total, a header row, then one tab-separated
  row per entry of sortNgrams(bigrams) holding the bigram, its frequency,
  its relative frequency, its mutual information and its relative entropy.

  A file that cannot be opened or read is reported on stderr and skipped.
  """
  bigrams     = {}  # bigram as key, frequency as value
  tokens      = {}  # token as key, frequency as value
  tokencount  = 0   # number of tokens
  bigramcount = 0   # number of bigrams

  for pattern in sys.argv[1:]:
    for path in glob.glob(os.path.normcase(pattern)):
      try:
        # The with-block closes the handle on every exit path.  The old
        # handler called close() on a name that is unbound when open()
        # itself is what failed.
        with open(path, "r") as handle:
          # readlines() pulls the whole file in first, so a read error
          # never leaves a half-counted file behind.
          for line in handle.readlines():
            line = line.strip().lower()
            if line == "":
              continue
            wordlist = getWordList(line)
            bigrams, bigramcount = getBigrams(wordlist, bigrams, bigramcount)
            tokens, tokencount = getTokens(wordlist, tokens, tokencount)
      except IOError as err:
        # Best effort: report the unreadable file and carry on.
        sys.stderr.write("Skipping " + path + ": " + str(err) + "\n")

  print("Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount))
  print("Bigram\tFrequency\tRelative Frequency\tMutual Information\tRelative Entropy")
  for entry in sortNgrams(bigrams):
    bigram, frequency = entry[0], entry[1]
    tokenlist = bigram.split()
    # Relative frequency is needed three times below; compute it once.
    relfreq = rF(frequency, bigramcount)
    # Note the argument order: probability of the second word, then the first.
    rel_entropy = RE(relfreq, P(tokenlist[1], tokens, tokencount), P(tokenlist[0], tokens, tokencount))
    mutual_info = MI(bigram, relfreq, tokens, tokencount)
    print("\t".join([bigram, str(frequency), str(relfreq), str(mutual_info), str(rel_entropy)]))
예제 #2
0
파일: MIRE.py 프로젝트: decode/Rpkg
def caculate(filename, freq=100):
  """Collect bigram statistics from one file and write the strong pairs out.

  Each non-blank line of `filename` is stripped, lower-cased and passed
  through getWordList, getBigrams and getTokens.  Two files are then
  (re)created in the current directory:

    mi.txt    a header line plus one comma-separated row per bigram whose
              mutual information exceeds `freq`
    dict.txt  the two words of each such bigram, space-separated

  In both files the second word of the bigram is written before the first.
  Finally merge("dict.txt", "data.basket") is called and its result is
  bound to `ret`.

  Args:
    filename: path of the text file to analyse.
    freq: threshold compared against the mutual information value
      (despite the name, it is not compared against a frequency).

  If `filename` cannot be opened or read, a warning goes to stderr and the
  output files are still written from whatever counts exist.
  """
  import sys  # function-scope import, used only for the warning on stderr

  bigrams     = {}  # bigram as key, frequency as value
  tokens      = {}  # token as key, frequency as value
  tokencount  = 0   # number of tokens
  bigramcount = 0   # number of bigrams
  alphabet    = ""  # all characters used

  try:
    # The with-block closes the handle on every exit path.  The old handler
    # called close() on a name that is unbound when open() itself failed.
    with open(filename, "r") as source:
      # readlines() pulls the whole file in first, so a read error never
      # leaves a half-counted file behind.
      for line in source.readlines():
        line = line.strip().lower()
        if line == "":
          continue
        wordlist = getWordList(line)
        bigrams, bigramcount = getBigrams(wordlist, bigrams, bigramcount)
        tokens, tokencount = getTokens(wordlist, tokens, tokencount)
  except IOError as err:
    sys.stderr.write("Skipping " + filename + ": " + str(err) + "\n")

  # Remove stale outputs before recreating them.
  if os.path.exists("mi.txt"):
    os.remove('mi.txt')
  if os.path.exists("dict.txt"):
    os.remove('dict.txt')

  sep = ", "
  # Both outputs are closed even if a row computation raises.
  with open("mi.txt", "w") as f, open("dict.txt", "w") as fl:
    print("Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount))
    f.write("T1, T2, Frequency, Relative Frequency, Mutual Information, Relative Entropy\n")
    for entry in sortNgrams(bigrams):
      bigram, frequency = entry[0], entry[1]
      tokenlist = bigram.split()
      # Relative frequency is needed three times below; compute it once.
      relfreq = rF(frequency, bigramcount)
      rel_entropy = RE(relfreq, P(tokenlist[1], tokens, tokencount), P(tokenlist[0], tokens, tokencount))
      mi = MI(bigram, relfreq, tokens, tokencount)
      if mi > freq:
        f.write(sep.join([tokenlist[1], tokenlist[0], str(frequency), str(relfreq), str(mi), str(rel_entropy)]) + "\n")
        fl.write(tokenlist[1] + " " + tokenlist[0] + "\n")

  ret = merge("dict.txt", "data.basket")
  '''
예제 #3
0
파일: MIRE.old.py 프로젝트: decode/Rpkg
	return bigramprob * math.log(bigramprob/(px * py) , 2)


if __name__ == "__main__":
	# Command-line driver: expand every argument as a glob pattern, count
	# bigrams and tokens over all non-blank lines of the matching files, then
	# print one tab-separated row per bigram.  Written so that it parses under
	# both Python 2.6+ and Python 3 (single-argument print calls, str methods
	# instead of the string module functions).
	bigrams     = {}	# bigram as key, frequency as value
	tokens      = {}	# token as key, frequency as value
	tokencount  = 0   # number of tokens
	bigramcount = 0   # number of bigrams
	alphabet    = ""  # all characters used

	for pattern in sys.argv[1:]:
		for path in glob.glob(os.path.normcase(pattern)):
			try:
				# The with-block closes the handle on every exit path.  The
				# old handler called close() on a name that does not refer
				# to an open file when open() itself is what failed.
				with open(path, "r") as handle:
					# readlines() pulls the whole file in first, so a read
					# error never leaves a half-counted file behind.
					for line in handle.readlines():
						line = line.strip().lower()
						if line == "":
							continue
						wordlist = getWordList(line)
						bigrams, bigramcount = getBigrams(wordlist, bigrams, bigramcount)
						tokens, tokencount = getTokens(wordlist, tokens, tokencount)
			except IOError as err:
				# Best effort: report the unreadable file and carry on.
				sys.stderr.write("Skipping " + path + ": " + str(err) + "\n")

	print("Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount))
	# Five columns are printed per row, so the header names all five.
	print("Bigram\tFrequency\tRelative Frequency\tMutual Information\tRelative Entropy")
	for entry in sortNgrams(bigrams):
		bigram, frequency = entry[0], entry[1]
		# The two words must come from the bigram being printed.  They used to
		# be split once, before the loop, from the leftover file-loop variable.
		myTokens = bigram.split()
		relfreq = rF(frequency, bigramcount)
		# rF is called with (count, total) for the bigram above, so the token
		# counts are looked up in `tokens` before being passed on.
		# NOTE(review): confirm against the definitions of rF and RE.
		second = rF(tokens[myTokens[1]], tokencount)
		first = rF(tokens[myTokens[0]], tokencount)
		print("\t".join([bigram, str(frequency), str(relfreq), str(MI(bigram, relfreq, tokens, tokencount)), str(RE(bigram, relfreq, second, first))]))
예제 #4
0
파일: ngrams.py 프로젝트: decode/Rpkg
	tokencount  = 0   # number of tokens
	bigramcount = 0   # number of bigrams

	for i in sys.argv[1:]:
		for x in glob.glob(os.path.normcase(i)):
			try:
				file = open(x, "r")
				for i in file.readlines():
					i = string.lower(string.strip(i))
					if i == "":
						continue
					bigrams, bigramcount = getNGrams(i, bigrams, bigramcount, 2)
					tokens, tokencount = getNGrams(i, tokens, tokencount, 1)
				file.close()
			except IOError:
				file.close()

	print "Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount)
	print "Bigram\tFrequency"
	for i in sortNgrams(bigrams):
		print i[0] + "\t" + str(i[1])
	print "Token\tFrequency"
	for i in sortNgrams(tokens):
		print i[0] + "\t" + str(i[1])
	print "Bigram\tRelative Frequency"
	for i in sortNgrams(bigrams):
		print i[0] + "\t" + str(float(i[1])/float(bigramcount))
	print "Token\tRelative Frequency"
	for i in sortNgrams(tokens):
		print i[0] + "\t" + str(float(i[1])/float(tokencount))
예제 #5
0
파일: REBToTo.py 프로젝트: decode/Rpkg
	global bigrams, tokens, bigramcount, tokencount
	pxy = float(bigrams[bigram])/float(bigramcount)
	px = float(tokens[token2])/float(tokencount)
	py  = float(tokens[token1])/float(tokencount)
	return py * math.log(py/(pxy/px), 2)


if __name__ == "__main__":
	# Command-line driver: expand every argument as a glob pattern and feed
	# each non-blank, stripped, lower-cased line of the matching files to the
	# getT* helpers, which update the module-level count tables.  Then print
	# one tab-separated row for each of the first PRINTWORDS entries of
	# sortNgrams(tokens).  Written so that it parses under both Python 2.6+
	# and Python 3.
	for pattern in sys.argv[1:]:
		for path in glob.glob(os.path.normcase(pattern)):
			try:
				# The with-block closes the handle on every exit path.  The
				# old handler called close() on a name that does not refer
				# to an open file when open() itself is what failed.
				with open(path, "r") as handle:
					# readlines() pulls the whole file in first, so a read
					# error never leaves a half-counted file behind.
					for line in handle.readlines():
						line = line.strip().lower()
						if line == "":
							continue
						wordlist = getTWordList(line)
						bigrams, bigramcount = getTBigrams(wordlist, bigrams, bigramcount, TOKEN, TOKEN)
						tokens, tokencount = getTTokens(wordlist, tokens, tokencount, TOKEN)
						bigramsleft, bigramsright = getTLRBigrams(wordlist, bigramsleft, bigramsright, TOKEN, TOKEN)
			except IOError as err:
				# Best effort: report the unreadable file and carry on.
				sys.stderr.write("Skipping " + path + ": " + str(err) + "\n")

	myTokens = sortNgrams(tokens)

	print("Left RE\tToken\tRight RE\tFrequency\tRelative Frequency")
	for x in range(min(len(myTokens), PRINTWORDS)):
		entry = myTokens[x]
		# PRE returns its pair as (right, left); the row prints left first.
		rre, lre = PRE(entry[0])
		print("\t".join([str(lre), entry[0], str(rre), str(entry[1]), str(float(entry[1])/float(tokencount))]))