import sys

# tokenise(), language_specials() and the TYPE_* constants are assumed to
# come from the project's tokenise module
from tokenise import *


def process_hyphens(soft_hyphen_dict, tokens):
    for c in range(len(tokens) - 3):
        if (tokens[c][1] == TYPE_WORD
                and tokens[c + 1][0] == "-"
                and tokens[c + 2][0] == "\n"
                and (tokens[c + 3][1] == TYPE_WORD
                     or tokens[c + 3][1] == TYPE_PAGEMARKER)):
            # rebuild the word that was split across the line break
            word = tokens[c][0] + tokens[c + 1][0]
            if tokens[c + 3][1] == TYPE_PAGEMARKER:
                # the second half sits on the far side of a page marker;
                # tokens[c + 4] must exist, so the bound is c + 5, not c + 4
                if len(tokens) < c + 5:
                    continue
                word += tokens[c + 4][0]
            else:
                word += tokens[c + 3][0]
            # assumes every end-of-line hyphenated word is in the dictionary
            if word == soft_hyphen_dict[word]:
                # the hyphen is real: keep it and append the marker
                tokens[c + 1][0] += u"%"
            else:
                # the hyphen is a line-breaking artefact: replace it outright
                tokens[c + 1][0] = u"%"
    return tokens


text = unicode(sys.stdin.read(), "utf-8")
soft_hyphen_dict = {}
for line in file(sys.argv[1]):
    fields = unicode(line, "utf-8").split()
    soft_hyphen_dict[fields[0]] = fields[2]
tokens = language_specials(tokenise(text, True))
tokens = process_hyphens(soft_hyphen_dict, tokens)
for t in tokens:
    sys.stdout.write(t[0].encode("utf-8"))
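# A minimal sketch of process_hyphens() in action. The tokens and dictionary
# below are hypothetical; only the [text, type] token shape and the TYPE_*
# constants are taken from the code above (the hyphen and newline token types
# are never checked, so TYPE_UNKNOWN stands in for them).
demo_dict = {u"well-known": u"well-known"}   # dictionary keeps this hyphen
demo_tokens = [[u"well", TYPE_WORD],
               [u"-", TYPE_UNKNOWN],
               [u"\n", TYPE_UNKNOWN],
               [u"known", TYPE_WORD]]
print process_hyphens(demo_dict, demo_tokens)
# -> the "-" token becomes u"-%": the hyphen survives, marked as checked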
import os
import sys

import project_data
from tokenise import *

# the usage string marks old_file as optional, so two arguments are enough
if len(sys.argv) < 3 or not os.access(sys.argv[1], os.R_OK):
    print "Usage: %s project_file user [old_file]" % sys.argv[0]
    sys.exit(-1)

# get a list of relevant pages
pd = project_data.ProjectData(sys.argv[1])
pages = pd.get_pages(sys.argv[2])
if len(pages) == 0:
    print "No pages for user " + sys.argv[2]
    sys.exit(-2)

# build the pages into a single block of text
text = u""
for p in pages:
    text += unicode(pd.get_text(p[0], sys.argv[2])[project_data.DATA], "utf-8")
    text += u"\n"

# tokenise the text
tokens = tokenise(text)

# make apostrophes and hyphens part of words
for c in range(len(tokens) - 2):
    if (tokens[c][1] == TYPE_WORD
            and (tokens[c + 1][0] == "'" or tokens[c + 1][0] == "-")
            and tokens[c + 2][1] == TYPE_WORD):
        tokens[c + 2] = [tokens[c][0] + tokens[c + 1][0] + tokens[c + 2][0],
                         TYPE_WORD]
        tokens[c] = tokens[c + 1] = ["", TYPE_UNKNOWN]
tokens = [X for X in tokens if X[1] != TYPE_UNKNOWN]

# split out eol hyphenated words
eol_hyphenated = set()
for c in range(len(tokens) - 3):
    if (tokens[c][1] == TYPE_WORD
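# A self-contained sketch of the join pass above, pulled out into a
# hypothetical helper (join_inner_punctuation is not part of this codebase)
# so the behaviour is easy to check in isolation.
def join_inner_punctuation(tokens):
    for c in range(len(tokens) - 2):
        if (tokens[c][1] == TYPE_WORD
                and (tokens[c + 1][0] == "'" or tokens[c + 1][0] == "-")
                and tokens[c + 2][1] == TYPE_WORD):
            # fold the three tokens into the rightmost slot so that words
            # with several inner marks keep collapsing on later iterations
            tokens[c + 2] = [tokens[c][0] + tokens[c + 1][0] + tokens[c + 2][0],
                             TYPE_WORD]
            tokens[c] = tokens[c + 1] = ["", TYPE_UNKNOWN]
    return [X for X in tokens if X[1] != TYPE_UNKNOWN]

print join_inner_punctuation([[u"don", TYPE_WORD],
                              [u"'", TYPE_UNKNOWN],
                              [u"t", TYPE_WORD]])
# -> [[u"don't", TYPE_WORD]]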
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth

from tokenise import *

sc = SparkContext('local', 'Exam_3')

# one tweet per "@@@"-separated block
data = open("output.txt", "r").read()
data = data.split("\n@@@\n")

# rewrite the corpus as one transaction per line: the unique tokens of each
# tweet, with the '"rt' retweet marker dropped
with open("newoutput.txt", "w") as f:
    for i in data:
        for j in set(tokenise(i)):
            if j != '"rt':
                f.write(j + " ")
        f.write("\n")

print "starting"
data = sc.textFile("newoutput.txt")
transactions = data.map(lambda line: line.strip().split(' '))
model = FPGrowth.train(transactions, minSupport=0.001, numPartitions=10)
# largest itemsets first
result = sorted(model.freqItemsets().collect(),
                key=lambda x: len(x[0]), reverse=True)
for fi in result:
    print(fi)
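# A minimal local sketch of the same FPGrowth call on in-line data, so the
# shape of the result can be seen without the tweet files (the transactions
# here are hypothetical).
demo = sc.parallelize([[u"vote", u"election"],
                       [u"vote", u"election", u"results"],
                       [u"vote"]])
demo_model = FPGrowth.train(demo, minSupport=0.5, numPartitions=1)
for fi in demo_model.freqItemsets().collect():
    print(fi)   # e.g. FreqItemset(items=[u'vote'], freq=3)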
import operator

from tokenise import *

with open("small.txt") as f:
    data = f.read()
print "haiyya"
data = data.split("@@@")

tf = []         # per-tweet term counts
idf = {}        # number of tweets each term occurs in
N = 1
tweetsize = []  # token count of each tweet, for normalising tf
for tweet in data:
    print "tweet" + str(N)
    N += 1
    te = tokenise(tweet)
    tweetsize.append(len(te))
    dic = {}
    for i in te:
        try:
            dic[i] += 1
        except KeyError:
            dic[i] = 1
    tf.append(dic)
    for key in dic:
        try:
            idf[key] += 1
        except KeyError:
            idf[key] = 1

for i in xrange(len(tf)):
    for j in tf[i]:
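# A typical continuation of the loop above (an assumption, not the original
# code): combine the raw counts into tf-idf weights per tweet.
import math

num_tweets = len(tf)
tfidf = []
for i in xrange(len(tf)):
    weights = {}
    if tweetsize[i]:             # skip empty tweets to avoid dividing by zero
        for j in tf[i]:
            # term frequency normalised by tweet length, times the inverse
            # document frequency over the whole corpus
            weights[j] = (float(tf[i][j]) / tweetsize[i]) \
                * math.log(float(num_tweets) / idf[j])
    tfidf.append(weights)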
print "Usage: %s my_file their_file soft_hyphens_file" sys.exit(-1) soft_hyphen_dict = {} for line in file(sys.argv[3]): fields = unicode(line, "utf-8").split() soft_hyphen_dict[fields[0]] = fields[2] sm = difflib.SequenceMatcher() chapters_1 = split_chapters(unicode(file(sys.argv[1]).read(), "utf-8")) chapters_2 = split_chapters(unicode(file(sys.argv[2]).read(), "utf-8")) chapter_str = [] for ch in range(20): seq1 = make_sequence(remove_soft_hyphens(soft_hyphen_dict, language_specials(tokenise(chapters_1[ch])))) seq2 = make_sequence(join_hyphens(language_specials(tokenise(chapters_2[ch])))) sm.set_seqs(seq1, seq2) outstr = u"<p>" matches = sm.get_matching_blocks() for c, m in enumerate(matches): if m[2] == 0: break #m[2] is count of matching paras next_match = matches[c + 1] outstr += "".join(seq1[m[0]:m[0] + m[2]]).replace("\n", "</p><p>") diff1 = seq1[m[0] + m[2]: next_match[0]] for t in diff1: if t == "\n": t = "\\n" outstr += "<span class='diff1'>%s</span>" % t diff2 = seq2[m[1] + m[2]: next_match[1]]