예제 #1
0
def main(argv):
	"""Match each company row of an input file against CRM records.

	For every row yielded by ``extract_row_generator`` the company name is
	cleaned and lower-cased, reduced to a keyword list, used to look up
	candidate records through ``PostgresInterface.get_record_match``, and the
	candidates are then passed to ``call_fuzzy_match_generator``. The winning
	match and its score are written back into the data frame wrapper.

	Args:
		argv: Command-line style argument sequence. ``argv[1]`` is handed to
			``PandaDataFrame``, ``argv[2]`` is the country and ``argv[3]``
			the territory forwarded to the record lookup.

	Raises:
		IndexError: If ``argv`` holds fewer than four items.
	"""
	pd_file, country, territory = argv[1], argv[2], argv[3]

	# Collaborators are created once, in this order, and reused for all rows.
	ignore_words_cls = IgnoreWords()
	postgres_interface_cls = PostgresInterface()
	df_cls = PandaDataFrame(pd_file)

	for index, row in extract_row_generator(df_cls.df):
		raw_name = row["Company Name"]

		# Normalised name and the keywords derived from it drive the lookup.
		company_name = clean(raw_name).lower()
		company_keywords_list = ignore_words_cls.return_keyword_lists(company_name)

		print("*****************")
		print(raw_name)
		print(company_keywords_list)

		# Candidate records narrowed by keywords, country and territory.
		crm_results = postgres_interface_cls.get_record_match(
			company_name, company_keywords_list, country, territory)

		# Start from an empty placeholder match with a zero score; the fuzzy
		# matcher is given the raw (uncleaned) lower-cased name.
		empty_match = Match(
			crm_company_id="",
			crm_company_name="",
			crm_group_id="",
			score="",
		)
		best_match, best_score = call_fuzzy_match_generator(
			empty_match, 0, raw_name.lower(), crm_results)

		# Progress output for every row, regardless of score.
		print("{} => best_match: {}".format(best_score, best_match))

		# Record the result for this row in the output frame.
		df_cls.update_df(index, best_match, best_score)
예제 #2
0
    def test_ignore_words(self):
        """
        Check the keyword lists produced by IgnoreWords.return_keyword_lists.

        Each case pairs a raw company name with the keywords expected for its
        lower-cased form. Per the original note, these keywords are used to
        query similar companies in the database so that fewer records need
        fuzzy matching. Order of the returned list is not significant; both
        sides are sorted before comparison.
        """
        ignore_words_cls = IgnoreWords()

        # (company name as written, expected keywords for the lower-cased name)
        cases = [
            ("Hada General TradingL.L.C",
             ["hada", "hada general trading", "tradingl.l.c"]),
            ("Dst GlobalMiddle East Limited",
             ["dst", "globalmiddle", "east", "dst globalmiddle east"]),
            ("Jacky's Gulf Fze",
             ["jacky's", "jacky's gulf"]),
            ("Emirates Trading Est.",
             ["emirates trading"]),
            ("Mena Business Services Fz-Llc",
             ["mena", "mena business services"]),
            ("Shokri Hassan Trading Co. L.L. C.",
             ["shokri", "hassan", "shokri hassan trading"]),
            ("Danube Bulding Materials Fzco.",
             ["danube", "bulding", "danube bulding materials"]),
            ("Alokozay International Ltd.",
             ["alokozay", "alokozay international"]),
            ("Malcolm Pirnie Middle East FZC",
             ["malcolm", "pirnie", "malcolm pirnie middle east"]),
            ("Ojaco Engineering Co.",
             ["ojaco", "ojaco engineering"]),
            ("Al Jaber L E G T Engineering & Contracting Alec L L C",
             ["jaber", "alec",
              "al jaber l e g t engineering & contracting alec"]),
            ("Arabtec Holding PJSC",
             ["arabtec", "arabtec holding"]),
            ("Advanced Pipes and Casts Company W.L.L.",
             ["advanced", "pipes", "casts",
              "advanced pipes and casts company"]),
            ("Smith International Inc.",
             ["smith", "smith international"]),
            ("ThyssenKrupp Xervon U.A.E. L.L.C.",
             ["thyssenkrupp", "xervon", "thyssenkrupp xervon u.a.e."]),
            ("Al Noor Hospitals Group PLC",
             ["noor", "al noor hospitals group"]),
            ("G.I.T Fze",
             ["g.i.t"]),
            ("Linde Engineering Middle East LLC",
             ["linde", "linde engineering middle east"]),
            ("Engineering Maintenance Company EMCO",
             ["emco", "maintenance", "engineering maintenance company emco"]),
            ("Al Moherbie Thermoplast LLC",
             ["moherbie", "thermoplast", "al moherbie thermoplast"]),
            ("Gibca Information Technology L L C Gibtek",
             ["gibca", "gibtek", "gibca information technology gibtek"]),
            ("Y&R Abu Dhabi",
             ["y&r", "y&r abu dhabi"]),
            ("Tolico Trading Oilfield Services L L C",
             ["tolico", "tolico trading oilfield services"]),
        ]

        # Plain loop (no subTest) so the first mismatch aborts the test.
        for company, expected_keywords in cases:
            self.assertEqual(
                sorted(expected_keywords),
                sorted(ignore_words_cls.return_keyword_lists(company.lower())))
예제 #3
0
    def test_ignore_words(self):
        """
        Verify IgnoreWords.return_keyword_lists on a set of company names.

        The returned keywords are (per the original note) used to query
        similar companies in the database, narrowing the records that fuzzy
        matching has to be applied to. Comparison ignores ordering: expected
        and actual keywords are both sorted first.
        """
        ignore_words_cls = IgnoreWords()

        def check(company, *expected):
            # Expected keywords come first so failure messages keep the
            # (expected, actual) orientation.
            actual = ignore_words_cls.return_keyword_lists(company.lower())
            self.assertEqual(sorted(expected), sorted(actual))

        check("Hada General TradingL.L.C",
              "hada", "hada general trading", "tradingl.l.c")
        check("Dst GlobalMiddle East Limited",
              "dst", "globalmiddle", "east", "dst globalmiddle east")
        check("Jacky's Gulf Fze",
              "jacky's", "jacky's gulf")
        check("Emirates Trading Est.",
              "emirates trading")
        check("Mena Business Services Fz-Llc",
              "mena", "mena business services")
        check("Shokri Hassan Trading Co. L.L. C.",
              "shokri", "hassan", "shokri hassan trading")
        check("Danube Bulding Materials Fzco.",
              "danube", "bulding", "danube bulding materials")
        check("Alokozay International Ltd.",
              "alokozay", "alokozay international")
        check("Malcolm Pirnie Middle East FZC",
              "malcolm", "pirnie", "malcolm pirnie middle east")
        check("Ojaco Engineering Co.",
              "ojaco", "ojaco engineering")
        check("Al Jaber L E G T Engineering & Contracting Alec L L C",
              "jaber", "alec",
              "al jaber l e g t engineering & contracting alec")
        check("Arabtec Holding PJSC",
              "arabtec", "arabtec holding")
        check("Advanced Pipes and Casts Company W.L.L.",
              "advanced", "pipes", "casts",
              "advanced pipes and casts company")
        check("Smith International Inc.",
              "smith", "smith international")
        check("ThyssenKrupp Xervon U.A.E. L.L.C.",
              "thyssenkrupp", "xervon", "thyssenkrupp xervon u.a.e.")
        check("Al Noor Hospitals Group PLC",
              "noor", "al noor hospitals group")
        check("G.I.T Fze",
              "g.i.t")
        check("Linde Engineering Middle East LLC",
              "linde", "linde engineering middle east")
        check("Engineering Maintenance Company EMCO",
              "emco", "maintenance", "engineering maintenance company emco")
        check("Al Moherbie Thermoplast LLC",
              "moherbie", "thermoplast", "al moherbie thermoplast")
        check("Gibca Information Technology L L C Gibtek",
              "gibca", "gibtek", "gibca information technology gibtek")
        check("Y&R Abu Dhabi",
              "y&r", "y&r abu dhabi")
        check("Tolico Trading Oilfield Services L L C",
              "tolico", "tolico trading oilfield services")