def get_sentences(raw_text):
    """Read a UTF-8 text file and return its sentences via the punkt tokenizer."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(raw_text, encoding='utf-8') as fp:
        data = fp.read()
    # Re-insert the missing space after sentence-final periods glued to the next
    # capital letter (".Next") or to a bracketed citation (".[R...") so that
    # punkt can detect the boundary.
    callback = lambda pat: '. ' + pat.group(1)[1:]
    data = re.sub(r'(\.[A-Z])', callback, data)
    data = re.sub(r'(\.\[[A-Z])', callback, data)
    with open('temp_china_business.txt', 'w+', encoding='utf-8') as g:
        g.write(data)
    return tokenizer.tokenize(data)
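# --- Illustrative sketch (not part of the pipeline, never called) -----------
# Shows the same repair get_sentences() applies, on a made-up in-memory string
# instead of a file. Assumes the punkt model has already been downloaded.
def _demo_sentence_repair():
    import re
    import nltk
    sample = "Profits rose sharply.Analysts were surprised.[Reuters]A follow-up is planned."
    fix = lambda pat: '. ' + pat.group(1)[1:]
    sample = re.sub(r'(\.[A-Z])', fix, sample)   # ".A" -> ". A"
    sample = re.sub(r'(\.\[[A-Z])', fix, sample)  # ".[R" -> ". [R"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(sample)  # punkt can now split at the repaired boundaries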
def emodify():
    # Part I: split into sentences, separated by "-----" markers
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open("uploads/input.txt", "r") as infile, open("uploads/input1.txt", "w") as outfile:
        data = infile.read()
        # Keep ASCII characters only and tokenize the str directly
        # (passing bytes to punkt raises a TypeError in Python 3).
        data = ''.join(filter(lambda x: ord(x) < 128, data))
        outfile.write('\n-----\n'.join(tokenizer.tokenize(data)))

    # Part II: join wrapped lines back into one line per sentence
    with open("uploads/input1.txt", "r") as infile, open("uploads/input2.txt", "w") as outfile:
        for line in infile:
            line = line.strip("\n")
            if line == "-----":
                outfile.write("\n")
            else:
                outfile.write(line + " ")

    # Part III: remove digits and break on double dashes
    with open("uploads/input2.txt", "r") as infile, open("uploads/input3.txt", "w") as outfile:
        for line in infile:
            line = line.replace("--", "\n")
            outfile.write(re.sub(r"\d+", "", line))

    # Part IV: lowercase every word and strip punctuation
    with open("uploads/input3.txt", "r") as filein2, open("uploads/inputfinal.txt", "w") as fileout:
        for linex in filein2:
            for word in linex.split():
                worda = word.lower()
                worda = ''.join(c for c in worda if c not in punctuation)
                fileout.write(worda + " ")
            fileout.write("\n")

    # Delete the intermediate files
    os.remove("uploads/input.txt")
    os.remove("uploads/input1.txt")
    os.remove("uploads/input2.txt")
    os.remove("uploads/input3.txt")
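# --- Illustrative sketch (not part of the pipeline, never called) -----------
# A hypothetical in-memory version of the same cleanup emodify() performs with
# temporary files: drop non-ASCII, sentence-split, strip digits, lowercase and
# remove punctuation. The function name and return format are made up.
def _demo_clean_text(raw):
    import re
    import nltk
    from string import punctuation as _punct
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # assumes punkt is installed
    ascii_only = ''.join(ch for ch in raw if ord(ch) < 128)
    cleaned = []
    for sentence in tokenizer.tokenize(ascii_only):
        sentence = re.sub(r"\d+", "", sentence)
        words = [''.join(c for c in w.lower() if c not in _punct) for w in sentence.split()]
        cleaned.append(' '.join(words))
    return '\n'.join(cleaned)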
def GetAttachments(msg_ids):
    """Download and store the attachments of each Gmail message in msg_ids.

    Args:
        msg_ids: List of message dicts (as returned by the Gmail API), each
            containing the 'id' of a message whose attachments should be saved.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('gmail', 'v1', http=http)
    user_id = "me"
    for x in msg_ids:
        try:
            message = service.users().messages().get(userId=user_id, id=x['id']).execute()
            for part in message['payload']['parts']:
                if part['filename']:
                    # Small attachments are inlined in the part body; larger ones
                    # must be fetched separately by attachmentId.
                    if 'data' in part['body']:
                        data = part['body']['data']
                    else:
                        att_id = part['body']['attachmentId']
                        att = service.users().messages().attachments().get(
                            userId=user_id, messageId=x['id'], id=att_id).execute()
                        data = att['data']
                    file_data = base64.urlsafe_b64decode(data.encode('UTF-8'))
                    path = part['filename']
                    # Only keep CSV attachments.
                    if 'csv' in path[-5:]:
                        print(path)
                        with open(path, 'wb') as f:
                            f.write(file_data)
            labels(x['id'])
        except errors.HttpError as error:
            print('An error occurred: %s' % error)
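# --- Illustrative sketch (not part of the pipeline, never called) -----------
# The Gmail API hands attachment bodies back as URL-safe base64. A small
# self-contained round trip showing the decoding step used above; the payload
# here is fabricated, not a real API response.
def _demo_decode_attachment():
    import base64
    original = b"id,amount\n1,42\n"
    encoded = base64.urlsafe_b64encode(original).decode('ascii')  # what the API would return
    decoded = base64.urlsafe_b64decode(encoded.encode('UTF-8'))   # same call as in GetAttachments()
    assert decoded == original
    return decoded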
def master():
    a = []
    global fname1
    global fname2
    with open(fname1) as fp1:
        data1 = fp1.read()
    values = {'s': 'basics', 'submit': 'search'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')

    # Helpers for extracting the visible text of a fetched page.
    def tag_visible(element):
        if element.parent.name in [
                'style', 'script', 'head', 'title', 'meta', '[document]', 'header id'
        ]:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(body):
        global visible_texts
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts)

    # Use the suspect document as the search query and fetch the top hits.
    query = data1
    for j in search(query, tld="co.in", num=10, stop=3, pause=2):
        print(j)
        try:
            req = urllib.request.Request(
                j, data,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
                })
            resp = urllib.request.urlopen(req)
            html = resp.read()
        except Exception:
            print("fail")
            continue
        # Save the visible text of the fetched page for comparison.
        with open(fname2, "w", encoding="utf-8") as fileout:
            fileout.write(text_from_html(html))


def splitSen():
    global a
    global b
    global fname1
    global fname2
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(fname1, encoding="utf-8") as fp1:
        data1 = fp1.read()
    with open(fname2, encoding="utf-8") as fp2:
        data2 = fp2.read()
    a = tokenizer.tokenize(data1)
    b = tokenizer.tokenize(data2)


def compare():
    global flag
    global a
    global b
    global common
    global common1
    global notcommon
    global notcommonU
    global common_list
    global synonyms
    global fname3
    global filtered_sentence1
    global filtered_sentence2
    global count
    global count1
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    fp3 = open(fname3, "a")
    for i in range(len(a)):
        # Collect the non-stop-word tokens of the suspect document (used by perCal()).
        word_tokens = tokenizer.tokenize(a[i])
        for y in word_tokens:
            if y not in stop_words:
                count.append(y)
        for j in range(len(b)):
            word_tokens1 = tokenizer.tokenize(a[i])
            word_tokens2 = tokenizer.tokenize(b[j])
            for w in word_tokens1:
                if w not in stop_words:
                    filtered_sentence1.append(w)
            for x in word_tokens2:
                if x not in stop_words:
                    filtered_sentence2.append(x)
            common = set(filtered_sentence1) & set(filtered_sentence2)
            common_words = ", ".join(common)
            if len(common) / len(filtered_sentence1) > 0.7:
                # Count words that occur in only one of the two sentences.
                for word in set(filtered_sentence1):
                    count1[word] = count1.get(word, 0) + 1
                for word in set(filtered_sentence2):
                    count1[word] = count1.get(word, 0) + 1
                for word in count1:
                    if count1[word] == 1 and word in filtered_sentence1:
                        notcommonU.append(word)
                # For each unmatched word, look for a synonym of its stem in the
                # candidate sentence and treat such a pair as copied as well.
                for wordM in notcommonU:
                    stemmedword = stemmer.stem(wordM)
                    for syn in wordnet.synsets(stemmedword):
                        for l in syn.lemmas():
                            if l.name() not in synonyms:
                                synonyms.append(l.name())
                    for wordsP in filtered_sentence2:
                        stemmedword2 = stemmer.stem(wordsP)
                        if stemmedword2 in set(synonyms):
                            if wordsP not in common1:
                                common1.append(wordsP)
                                common_list.append(wordM)
            if len(common) / len(filtered_sentence1) > 0.7:
                common_list.extend(common)
                fp3.write(",".join(common_list))
                fp3.write("//////")
            # print('****************************')
            # print("Sentence 1: " + str(a[i]))
            # print("Sentence 2: " + str(b[j]))
            # print('----------------------------')
            # print("Word Tokens 1: " + str(word_tokens1))
            # print("Word Tokens 2: " + str(word_tokens2))
            # print('----------------------------')
            non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
            # print("Filtered Tokens 1: " + str(filtered_sentence1).translate(non_bmp_map))
            # print("Filtered Tokens 2: " + str(filtered_sentence2).translate(non_bmp_map))
            # print('----------------------------')
            # print("Common words are: " + str(common))
            # print('****************************')
            # Reset the per-pair state before comparing the next pair of sentences.
            filtered_sentence1 = []
            filtered_sentence2 = []
            count1 = {}
    fp3.close()


# DISPLAY OF FINAL OUTPUT
def perCal():
    global count
    print("common_list is :" + str(common_list))
    print("notcommon list is : " + str(notcommonU))
    print("replaced :" + str(common1))
    per = float(len(common_list) / len(count)) * 100.00
    print("\nPercentage copied is", format(per, '.2f'))


# DISPLAY THE COPIED CONTENT
def display():
    global fname1
    global common_list
    file_path = fname1
    while not os.path.exists(file_path):
        file_path = input("The path does not exist, enter the file path again: ")
    with open(file_path, mode='rt', encoding='utf-8') as f:
        text = f.read()
    # Highlight every copied span with an ANSI colour escape.
    for search_word in common_list:
        if search_word in text:
            print(text.replace(search_word, '\033[44;33m{}\033[m'.format(search_word)))


splitSen()
compare()
perCal()
display()
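# --- Illustrative sketch (not part of the pipeline, never called) -----------
# A minimal, self-contained version of the core check in compare(): tokenize
# two sentences, drop stop words, and flag the pair when the overlap ratio
# exceeds 0.7. The tiny hard-coded stop-word list stands in for NLTK's
# stopwords corpus so the sketch runs without any downloads.
def _demo_overlap(sent_a, sent_b, threshold=0.7):
    import re
    stop = {'the', 'a', 'an', 'is', 'of', 'and', 'to', 'in'}
    tok_a = {w for w in re.findall(r'\w+', sent_a.lower()) if w not in stop}
    tok_b = {w for w in re.findall(r'\w+', sent_b.lower()) if w not in stop}
    common = tok_a & tok_b
    ratio = len(common) / len(tok_a) if tok_a else 0.0
    return ratio > threshold, ratio, common

# e.g. _demo_overlap("A database is an organised collection of data",
#                    "The database is an organised collection of data")
# -> (True, 1.0, {'database', 'organised', 'collection', 'data'})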