示例#1
0
def word_match_share(row):
    """Count the distinct non-stopword tokens shared by the two questions.

    ``row`` is expected to expose 'question1' and 'question2' entries
    (e.g. a pandas row); tokens are whitespace-split, case-sensitive.
    """
    swords = get_stop_words()
    tokens1 = {tok for tok in str(row['question1']).split() if tok not in swords}
    tokens2 = {tok for tok in str(row['question2']).split() if tok not in swords}
    return len(tokens1 & tokens2)
示例#2
0
def calculate_distance(q1: str, q2: str, model: gensim.models.KeyedVectors):
    """Symmetric Hausdorff-style distance between two questions.

    Each question is lowercased, whitespace-split, and filtered to alphabetic
    non-stopword tokens; tokens missing from ``model`` are skipped. For every
    remaining vector on one side, the Euclidean distance to the closest vector
    on the other side is taken, and the maximum of those minima (in both
    directions) is returned.

    Edge cases preserved from the original implementation: returns 0 when
    both sides have no in-vocabulary words, and the 1e10 sentinel when
    exactly one side is empty.
    """
    swords = get_stop_words()
    q1_tokens = [word for word in str(q1).lower().split()
                 if word not in swords and word.isalpha()]
    q2_tokens = [word for word in str(q2).lower().split()
                 if word not in swords and word.isalpha()]

    def _vectors(tokens):
        # Look up each token's embedding; gensim raises KeyError for
        # out-of-vocabulary words, which are simply skipped.
        vecs = []
        for word in tokens:
            try:
                vecs.append(model[word])
            except KeyError:
                continue
        return vecs

    # NOTE: the original zipped tokens with vectors here, but once an OOV
    # word is skipped the pairing desyncs — and the token halves were never
    # used. Iterate the vector lists directly instead.
    wq1 = _vectors(q1_tokens)
    wq2 = _vectors(q2_tokens)

    maximum = 0
    for w1 in wq1:
        minimum = 1e10  # sentinel: survives when wq2 is empty
        for w2 in wq2:
            minimum = min(minimum, euclidean(w1, w2))
        maximum = max(maximum, minimum)

    for w2 in wq2:
        minimum = 1e10
        for w1 in wq1:
            minimum = min(minimum, euclidean(w1, w2))
        maximum = max(maximum, minimum)
    return maximum
示例#3
0
 def __init__(self, options):
     """Configure word-trigram counting plus unnormalised tf-idf weighting."""
     super().__init__(options)
     self.stop_words = get_stop_words()
     # Word trigrams only; drop terms appearing in >50% of docs or <4 docs.
     self.count_vectorizer = CountVectorizer(
         max_df=0.5, min_df=4, ngram_range=(3, 3))
     # norm=None keeps raw tf-idf weights (no length normalisation).
     self.tfidf_transformer = TfidfTransformer(norm=None)
示例#4
0
 def __init__(self, options):
     """Set up a character 5-gram tf-idf vectoriser for this feature."""
     super().__init__(options)
     self.stop_words = get_stop_words()
     # Character-level 5-grams; ignore terms in >50% of docs or <8 docs.
     self.vectorizer = TfidfVectorizer(
         analyzer='char', ngram_range=(5, 5), max_df=0.5, min_df=8)
示例#5
0
 def calculate_feature(self, data):
     """Read a CSV, compute this feature per row, and return it as a one-column frame."""
     frame = pd.read_csv(data)
     swords = get_stop_words()
     # _calculate_feature is applied row by row; tqdm shows progress.
     frame[self.feature_name] = [
         self._calculate_feature(row=row, swords=swords)
         for _, row in tqdm(frame.iterrows())
     ]
     return frame[[self.feature_name]]
示例#6
0
 def __init__(self, options):
     """Initialise a binary (presence/absence) unigram tf-idf vectoriser."""
     super().__init__(options)
     self.stop_words = get_stop_words()
     # Unigrams, binary term presence; the possessive token "'s" is the
     # only vectoriser-level stopword.
     self.vectorizer = TfidfVectorizer(
         ngram_range=(1, 1),
         max_df=0.5,
         min_df=8,
         binary=True,
         stop_words=["'s"])
 def __init__(self,
              options,
              max_df=0.5,
              min_df=20,
              ngram_range=(1, 1),
              binary=True):
     """Character-level count-vectoriser feature; input is read via pandas."""
     super().__init__(options)
     self.stop_words = get_stop_words()
     # Data files are loaded with pandas' CSV reader.
     self.read_func = pd.read_csv
     self.vectorizer = CountVectorizer(analyzer='char',
                                       ngram_range=ngram_range,
                                       max_df=max_df,
                                       min_df=min_df,
                                       binary=binary)
def _sentence2vec(s: str, model):
    """Sum the word embeddings of a sentence and L2-normalise the result.

    Tokens are lowercased, whitespace-split, and kept only if alphabetic and
    not stopwords; words missing from ``model`` are skipped.

    NOTE: if no token has an embedding the sum is 0 and the division yields
    nan (with a runtime warning) — unchanged from the original behaviour.
    """
    swords = get_stop_words()
    # Single pass: lowercase, alphabetic, non-stopword tokens only.
    words = [w for w in str(s).lower().split()
             if w not in swords and w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        # gensim raises KeyError for out-of-vocabulary words; the original
        # bare `except:` also swallowed unrelated errors.
        except KeyError:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v**2).sum())
def nltk_stemming(data_file):
    """Return the tokenised dataset with Porter-stemmed questions, cached on disk.

    The stemmed frame is written next to ``data_file`` with a '.stemmed'
    suffix; on subsequent calls the cache is read back directly.
    """
    cache_file = data_file + '.stemmed'
    # Check the cache before building the stemmer and stopword list, so a
    # cache hit does no unnecessary work.
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    stemmer = nltk.stem.PorterStemmer()
    swords = get_stop_words()
    data = nltk_tokenize(data_file)
    # Same stemming is applied to both question columns.
    for col in ('question1', 'question2'):
        data[col] = data[col].apply(lambda s: " ".join(
            stemming_words(str(s).split(), stopwords=swords, stemmer=stemmer))
                                    ).values
    data.to_csv(cache_file, index=False)
    return data
示例#10
0
def word_match_share(row):
    """Ratio of shared non-stopword tokens between the two questions.

    Both questions are lowercased and whitespace-split; stopwords are
    discarded. Returns 2*|intersection| / (|q1 tokens| + |q2 tokens|).
    """
    swords = get_stop_words()
    q1words = {w for w in str(row['question1']).lower().split()
               if w not in swords}
    q2words = {w for w in str(row['question2']).lower().split()
               if w not in swords}
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are
        # nothing but stopwords.
        return 0
    # Each shared word is counted once per side, so the numerator is twice
    # the intersection size — identical to the original dict-based version.
    shared = q1words & q2words
    return (len(shared) + len(shared)) / (len(q1words) + len(q2words))
    def calculate_row_feature(self, row_):
        """Symmetric max-of-min score over pairwise dot products of word vectors.

        ``row_`` is an (index, row) pair as produced by DataFrame.iterrows().
        Tokens of both questions are filtered against the stopword list, then
        looked up in ``self.model``; out-of-vocabulary words are skipped.

        Returns 0 when both questions have no in-vocabulary words, and the
        1e10 sentinel when exactly one side is empty (behaviour preserved).
        """
        row = row_[1]
        swords = get_stop_words()
        q1 = [
            word for word in str(row['question1']).split()
            if word not in swords
        ]
        q2 = [
            word for word in str(row['question2']).split()
            if word not in swords
        ]

        wq1 = []
        wq2 = []
        for word in q1:
            try:
                wq1.append(self.model[word])
            # OOV lookups raise KeyError in gensim; the original bare
            # `except:` also hid unrelated failures.
            except KeyError:
                continue
        for word in q2:
            try:
                wq2.append(self.model[word])
            except KeyError:
                continue

        # Pairwise dot products. NOTE(review): np.dot grows with similarity,
        # yet it is *minimised* below as if it were a distance — confirm this
        # asymmetry is intended before changing it.
        distance = np.zeros((len(wq1), len(wq2)))
        for i1, w1 in enumerate(wq1):
            for i2, w2 in enumerate(wq2):
                distance[i1, i2] = np.dot(w1, w2)

        maximum = 0
        for i1 in range(len(wq1)):
            minimum = 1e10  # sentinel: survives when wq2 is empty
            for i2 in range(len(wq2)):
                minimum = min(minimum, distance[i1, i2])
            maximum = max(maximum, minimum)

        for i2 in range(len(wq2)):
            minimum = 1e10
            for i1 in range(len(wq1)):
                minimum = min(minimum, distance[i1, i2])
            maximum = max(maximum, minimum)
        return maximum
示例#12
0
 def __init__(self, options, ngram_range=(1, 1)):
     """Word n-gram tf-idf feature; the n-gram range is configurable."""
     super().__init__(options)
     self.stop_words = get_stop_words()
     # Ignore terms appearing in >50% of documents or in fewer than 8.
     self.vectorizer = TfidfVectorizer(
         ngram_range=ngram_range, max_df=0.5, min_df=8)
示例#13
0
def wmd(row, model: gensim.models.KeyedVectors):
    """Word Mover's Distance between the two questions, stopwords removed."""
    swords = get_stop_words()
    # Tokenise both questions the same way: whitespace split, no stopwords.
    tokens = {
        col: [w for w in str(row[col]).split() if w not in swords]
        for col in ('question1', 'question2')
    }
    return model.wmdistance(tokens['question1'], tokens['question2'])